On Fri, Jul 16, 2021 at 4:38 AM Richard Sandiford <richard.sandif...@arm.com> wrote: > > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > > 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with > > vec_duplicate_optab to duplicate QI value to TI/OI/XI value. > > 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard > > scratch register to avoid stack realignment when expanding memset. > > > > PR middle-end/90773 > > * builtins.c (gen_memset_value_from_prev): New function. > > (gen_memset_broadcast): Likewise. > > (builtin_memset_read_str): Use gen_memset_value_from_prev > > and gen_memset_broadcast. > > (builtin_memset_gen_str): Likewise. > > * target.def (gen_memset_scratch_rtx): New hook. > > * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX. > > * doc/tm.texi: Regenerated. > > --- > > gcc/builtins.c | 123 +++++++++++++++++++++++++++++++++++++-------- > > gcc/doc/tm.texi | 5 ++ > > gcc/doc/tm.texi.in | 2 + > > gcc/target.def | 7 +++ > > 4 files changed, 116 insertions(+), 21 deletions(-) > > > > diff --git a/gcc/builtins.c b/gcc/builtins.c > > index 39ab139b7e1..c1758ae2efc 100644 > > --- a/gcc/builtins.c > > +++ b/gcc/builtins.c > > @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target) > > return NULL_RTX; > > } > > > > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) > > - bytes from constant string DATA + OFFSET and return it as target > > - constant. If PREV isn't nullptr, it has the RTL info from the > > +/* Return the RTL of a register in MODE generated from PREV in the > > previous iteration. */ > > > > -rtx > > -builtin_memset_read_str (void *data, void *prevp, > > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > > - scalar_int_mode mode) > > +static rtx > > +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode) > > { > > + rtx target = nullptr; > > by_pieces_prev *prev = (by_pieces_prev *) prevp; > > if (prev != nullptr && prev->data != nullptr) > > { > > /* Use the previous data in the same mode. */ > > if (prev->mode == mode) > > return prev->data; > > + > > + rtx prev_rtx = prev->data; > > + machine_mode prev_mode = prev->mode; > > + unsigned int word_size = GET_MODE_SIZE (word_mode); > > + if (word_size < GET_MODE_SIZE (prev->mode) > > + && word_size > GET_MODE_SIZE (mode)) > > + { > > + /* First generate subreg of word mode if the previous mode is > > + wider than word mode and word mode is wider than MODE. */ > > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx, > > + prev_mode, 0); > > + prev_mode = word_mode; > > + } > > + if (prev_rtx != nullptr) > > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0); > > } > > + return target; > > +} > > + > > +/* Return the RTL of a register in MODE broadcasted from DATA. */ > > + > > +static rtx > > +gen_memset_broadcast (rtx data, scalar_int_mode mode) > > +{ > > + /* Skip if regno_reg_rtx isn't initialized. */ > > + if (!regno_reg_rtx) > > + return nullptr; > > + > > + rtx target = nullptr; > > + > > + unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode); > > + machine_mode vector_mode; > > + if (!mode_for_vector (QImode, nunits).exists (&vector_mode)) > > + gcc_unreachable (); > > Sorry, I realise it's a bit late to be raising this objection now, > but I don't think it's a good idea to use scalar integer modes as > a proxy for vector modes. In principle there's no reason why a > target has to define an integer mode for every vector mode.
A target always defines the largest integer mode. > If we want the mode to be a vector then I think the by-pieces > infrastructure should be extended to support vectors directly, > rather than assuming that each piece can be represented as > a scalar_int_mode. > The current by-pieces infrastructure operates on scalar_int_mode. Only for memset, there is /* Callback routine for store_by_pieces. Return the RTL of a register containing GET_MODE_SIZE (MODE) consecutive copies of the unsigned char value given in the RTL register data. For example, if mode is 4 bytes wide, return the RTL for 0x01010101*data. If PREV isn't nullptr, it has the RTL info from the previous iteration. */ static rtx builtin_memset_gen_str (void *data, void *prevp, HOST_WIDE_INT offset ATTRIBUTE_UNUSED, scalar_int_mode mode) It is a broadcast. If a target can broadcast a byte to a wider integer, can you suggest a way to use it in the current by-pieces infrastructure? Thanks. > Thanks, > Richard > > > + > > + enum insn_code icode = optab_handler (vec_duplicate_optab, > > + vector_mode); > > + if (icode != CODE_FOR_nothing) > > + { > > + rtx reg = targetm.gen_memset_scratch_rtx (vector_mode); > > + if (CONST_INT_P (data)) > > + { > > + /* Use the move expander with CONST_VECTOR. */ > > + rtx const_vec = gen_const_vec_duplicate (vector_mode, data); > > + emit_move_insn (reg, const_vec); > > + } > > + else > > + { > > + > > + class expand_operand ops[2]; > > + create_output_operand (&ops[0], reg, vector_mode); > > + create_input_operand (&ops[1], data, QImode); > > + expand_insn (icode, 2, ops); > > + if (!rtx_equal_p (reg, ops[0].value)) > > + emit_move_insn (reg, ops[0].value); > > + } > > + target = lowpart_subreg (mode, reg, vector_mode); > > + } > > + > > + return target; > > +} > > + > > +/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) > > + bytes from constant string DATA + OFFSET and return it as target > > + constant. If PREV isn't nullptr, it has the RTL info from the > > + previous iteration. */ > > > > +rtx > > +builtin_memset_read_str (void *data, void *prev, > > + HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > > + scalar_int_mode mode) > > +{ > > + rtx target; > > const char *c = (const char *) data; > > - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode)); > > + char *p; > > + > > + /* Don't use the previous value if size is 1. */ > > + if (GET_MODE_SIZE (mode) != 1) > > + { > > + target = gen_memset_value_from_prev (prev, mode); > > + if (target != nullptr) > > + return target; > > + > > + p = XALLOCAVEC (char, GET_MODE_SIZE (QImode)); > > + memset (p, *c, GET_MODE_SIZE (QImode)); > > + rtx src = c_readstr (p, QImode); > > + target = gen_memset_broadcast (src, mode); > > + if (target != nullptr) > > + return target; > > + } > > + > > + p = XALLOCAVEC (char, GET_MODE_SIZE (mode)); > > > > memset (p, *c, GET_MODE_SIZE (mode)); > > > > @@ -6719,7 +6804,7 @@ builtin_memset_read_str (void *data, void *prevp, > > nullptr, it has the RTL info from the previous iteration. */ > > > > static rtx > > -builtin_memset_gen_str (void *data, void *prevp, > > +builtin_memset_gen_str (void *data, void *prev, > > HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > > scalar_int_mode mode) > > { > > @@ -6727,22 +6812,18 @@ builtin_memset_gen_str (void *data, void *prevp, > > size_t size; > > char *p; > > > > - by_pieces_prev *prev = (by_pieces_prev *) prevp; > > - if (prev != nullptr && prev->data != nullptr) > > - { > > - /* Use the previous data in the same mode. */ > > - if (prev->mode == mode) > > - return prev->data; > > - > > - target = simplify_gen_subreg (mode, prev->data, prev->mode, 0); > > - if (target != nullptr) > > - return target; > > - } > > - > > size = GET_MODE_SIZE (mode); > > if (size == 1) > > return (rtx) data; > > > > + target = gen_memset_value_from_prev (prev, mode); > > + if (target != nullptr) > > + return target; > > + > > + target = gen_memset_broadcast ((rtx) data, mode); > > + if (target != nullptr) > > + return target; > > + > > p = XALLOCAVEC (char, size); > > memset (p, 1, size); > > coeff = c_readstr (p, mode); > > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi > > index 2a41ae5fba1..8849711fcf8 100644 > > --- a/gcc/doc/tm.texi > > +++ b/gcc/doc/tm.texi > > @@ -12122,6 +12122,11 @@ This function prepares to emit a conditional > > comparison within a sequence > > @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the > > compares. > > @end deftypefn > > > > +@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode > > @var{mode}) > > +This hook should return an rtx for scratch register in @var{mode} to > > +be used by memset broadcast. The default is @code{gen_reg_rtx}. > > +@end deftypefn > > + > > @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned > > @var{nunroll}, class loop *@var{loop}) > > This target hook returns a new value for the number of times @var{loop} > > should be unrolled. The parameter @var{nunroll} is the number of times > > diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in > > index f881cdabe9e..a6bbf4f2667 100644 > > --- a/gcc/doc/tm.texi.in > > +++ b/gcc/doc/tm.texi.in > > @@ -7958,6 +7958,8 @@ lists. > > > > @hook TARGET_GEN_CCMP_NEXT > > > > +@hook TARGET_GEN_MEMSET_SCRATCH_RTX > > + > > @hook TARGET_LOOP_UNROLL_ADJUST > > > > @defmac POWI_MAX_MULTS > > diff --git a/gcc/target.def b/gcc/target.def > > index c009671c583..5fb287db3bd 100644 > > --- a/gcc/target.def > > +++ b/gcc/target.def > > @@ -2726,6 +2726,13 @@ DEFHOOK > > rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, > > tree op0, tree op1, int bit_code), > > NULL) > > > > +DEFHOOK > > +(gen_memset_scratch_rtx, > > + "This hook should return an rtx for scratch register in @var{mode} to\n\ > > +be used by memset broadcast. The default is @code{gen_reg_rtx}.", > > + rtx, (machine_mode mode), > > + gen_reg_rtx) > > + > > /* Return a new value for loop unroll size. */ > > DEFHOOK > > (loop_unroll_adjust, -- H.J.