On Fri, Jul 16, 2021 at 4:38 AM Richard Sandiford
<[email protected]> wrote:
>
> "H.J. Lu via Gcc-patches" <[email protected]> writes:
> > 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with
> > vec_duplicate_optab to duplicate QI value to TI/OI/XI value.
> > 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard
> > scratch register to avoid stack realignment when expanding memset.
> >
> > PR middle-end/90773
> > * builtins.c (gen_memset_value_from_prev): New function.
> > (gen_memset_broadcast): Likewise.
> > (builtin_memset_read_str): Use gen_memset_value_from_prev
> > and gen_memset_broadcast.
> > (builtin_memset_gen_str): Likewise.
> > * target.def (gen_memset_scratch_rtx): New hook.
> > * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.
> > * doc/tm.texi: Regenerated.
> > ---
> > gcc/builtins.c | 123 +++++++++++++++++++++++++++++++++++++--------
> > gcc/doc/tm.texi | 5 ++
> > gcc/doc/tm.texi.in | 2 +
> > gcc/target.def | 7 +++
> > 4 files changed, 116 insertions(+), 21 deletions(-)
> >
> > diff --git a/gcc/builtins.c b/gcc/builtins.c
> > index 39ab139b7e1..c1758ae2efc 100644
> > --- a/gcc/builtins.c
> > +++ b/gcc/builtins.c
> > @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target)
> > return NULL_RTX;
> > }
> >
> > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
> > - bytes from constant string DATA + OFFSET and return it as target
> > - constant. If PREV isn't nullptr, it has the RTL info from the
> > +/* Return the RTL of a register in MODE generated from PREV in the
> > previous iteration. */
> >
> > -rtx
> > -builtin_memset_read_str (void *data, void *prevp,
> > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > - scalar_int_mode mode)
> > +static rtx
> > +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode)
> > {
> > + rtx target = nullptr;
> > by_pieces_prev *prev = (by_pieces_prev *) prevp;
> > if (prev != nullptr && prev->data != nullptr)
> > {
> > /* Use the previous data in the same mode. */
> > if (prev->mode == mode)
> > return prev->data;
> > +
> > + rtx prev_rtx = prev->data;
> > + machine_mode prev_mode = prev->mode;
> > + unsigned int word_size = GET_MODE_SIZE (word_mode);
> > + if (word_size < GET_MODE_SIZE (prev->mode)
> > + && word_size > GET_MODE_SIZE (mode))
> > + {
> > + /* First generate subreg of word mode if the previous mode is
> > + wider than word mode and word mode is wider than MODE. */
> > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
> > + prev_mode, 0);
> > + prev_mode = word_mode;
> > + }
> > + if (prev_rtx != nullptr)
> > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
> > }
> > + return target;
> > +}
> > +
> > +/* Return the RTL of a register in MODE broadcasted from DATA. */
> > +
> > +static rtx
> > +gen_memset_broadcast (rtx data, scalar_int_mode mode)
> > +{
> > + /* Skip if regno_reg_rtx isn't initialized. */
> > + if (!regno_reg_rtx)
> > + return nullptr;
> > +
> > + rtx target = nullptr;
> > +
> > + unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);
> > + machine_mode vector_mode;
> > + if (!mode_for_vector (QImode, nunits).exists (&vector_mode))
> > + gcc_unreachable ();
>
> Sorry, I realise it's a bit late to be raising this objection now,
> but I don't think it's a good idea to use scalar integer modes as
> a proxy for vector modes. In principle there's no reason why a
> target has to define an integer mode for every vector mode.
A target always defines the largest integer mode.
> If we want the mode to be a vector then I think the by-pieces
> infrastructure should be extended to support vectors directly,
> rather than assuming that each piece can be represented as
> a scalar_int_mode.
>
The current by-pieces infrastructure operates on scalar_int_mode.
Only for memset, there is
/* Callback routine for store_by_pieces. Return the RTL of a register
containing GET_MODE_SIZE (MODE) consecutive copies of the unsigned
char value given in the RTL register data. For example, if mode is
4 bytes wide, return the RTL for 0x01010101*data. If PREV isn't
nullptr, it has the RTL info from the previous iteration. */
static rtx
builtin_memset_gen_str (void *data, void *prevp,
HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
scalar_int_mode mode)
It is a broadcast. If a target can broadcast a byte to a wider integer,
can you suggest a way to use it in the current by-pieces infrastructure?
Thanks.
> Thanks,
> Richard
>
> > +
> > + enum insn_code icode = optab_handler (vec_duplicate_optab,
> > + vector_mode);
> > + if (icode != CODE_FOR_nothing)
> > + {
> > + rtx reg = targetm.gen_memset_scratch_rtx (vector_mode);
> > + if (CONST_INT_P (data))
> > + {
> > + /* Use the move expander with CONST_VECTOR. */
> > + rtx const_vec = gen_const_vec_duplicate (vector_mode, data);
> > + emit_move_insn (reg, const_vec);
> > + }
> > + else
> > + {
> > +
> > + class expand_operand ops[2];
> > + create_output_operand (&ops[0], reg, vector_mode);
> > + create_input_operand (&ops[1], data, QImode);
> > + expand_insn (icode, 2, ops);
> > + if (!rtx_equal_p (reg, ops[0].value))
> > + emit_move_insn (reg, ops[0].value);
> > + }
> > + target = lowpart_subreg (mode, reg, vector_mode);
> > + }
> > +
> > + return target;
> > +}
> > +
> > +/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
> > + bytes from constant string DATA + OFFSET and return it as target
> > + constant. If PREV isn't nullptr, it has the RTL info from the
> > + previous iteration. */
> >
> > +rtx
> > +builtin_memset_read_str (void *data, void *prev,
> > + HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > + scalar_int_mode mode)
> > +{
> > + rtx target;
> > const char *c = (const char *) data;
> > - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
> > + char *p;
> > +
> > + /* Don't use the previous value if size is 1. */
> > + if (GET_MODE_SIZE (mode) != 1)
> > + {
> > + target = gen_memset_value_from_prev (prev, mode);
> > + if (target != nullptr)
> > + return target;
> > +
> > + p = XALLOCAVEC (char, GET_MODE_SIZE (QImode));
> > + memset (p, *c, GET_MODE_SIZE (QImode));
> > + rtx src = c_readstr (p, QImode);
> > + target = gen_memset_broadcast (src, mode);
> > + if (target != nullptr)
> > + return target;
> > + }
> > +
> > + p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
> >
> > memset (p, *c, GET_MODE_SIZE (mode));
> >
> > @@ -6719,7 +6804,7 @@ builtin_memset_read_str (void *data, void *prevp,
> > nullptr, it has the RTL info from the previous iteration. */
> >
> > static rtx
> > -builtin_memset_gen_str (void *data, void *prevp,
> > +builtin_memset_gen_str (void *data, void *prev,
> > HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > scalar_int_mode mode)
> > {
> > @@ -6727,22 +6812,18 @@ builtin_memset_gen_str (void *data, void *prevp,
> > size_t size;
> > char *p;
> >
> > - by_pieces_prev *prev = (by_pieces_prev *) prevp;
> > - if (prev != nullptr && prev->data != nullptr)
> > - {
> > - /* Use the previous data in the same mode. */
> > - if (prev->mode == mode)
> > - return prev->data;
> > -
> > - target = simplify_gen_subreg (mode, prev->data, prev->mode, 0);
> > - if (target != nullptr)
> > - return target;
> > - }
> > -
> > size = GET_MODE_SIZE (mode);
> > if (size == 1)
> > return (rtx) data;
> >
> > + target = gen_memset_value_from_prev (prev, mode);
> > + if (target != nullptr)
> > + return target;
> > +
> > + target = gen_memset_broadcast ((rtx) data, mode);
> > + if (target != nullptr)
> > + return target;
> > +
> > p = XALLOCAVEC (char, size);
> > memset (p, 1, size);
> > coeff = c_readstr (p, mode);
> > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> > index 2a41ae5fba1..8849711fcf8 100644
> > --- a/gcc/doc/tm.texi
> > +++ b/gcc/doc/tm.texi
> > @@ -12122,6 +12122,11 @@ This function prepares to emit a conditional
> > comparison within a sequence
> > @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the
> > compares.
> > @end deftypefn
> >
> > +@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode
> > @var{mode})
> > +This hook should return an rtx for scratch register in @var{mode} to
> > +be used by memset broadcast. The default is @code{gen_reg_rtx}.
> > +@end deftypefn
> > +
> > @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned
> > @var{nunroll}, class loop *@var{loop})
> > This target hook returns a new value for the number of times @var{loop}
> > should be unrolled. The parameter @var{nunroll} is the number of times
> > diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> > index f881cdabe9e..a6bbf4f2667 100644
> > --- a/gcc/doc/tm.texi.in
> > +++ b/gcc/doc/tm.texi.in
> > @@ -7958,6 +7958,8 @@ lists.
> >
> > @hook TARGET_GEN_CCMP_NEXT
> >
> > +@hook TARGET_GEN_MEMSET_SCRATCH_RTX
> > +
> > @hook TARGET_LOOP_UNROLL_ADJUST
> >
> > @defmac POWI_MAX_MULTS
> > diff --git a/gcc/target.def b/gcc/target.def
> > index c009671c583..5fb287db3bd 100644
> > --- a/gcc/target.def
> > +++ b/gcc/target.def
> > @@ -2726,6 +2726,13 @@ DEFHOOK
> > rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code,
> > tree op0, tree op1, int bit_code),
> > NULL)
> >
> > +DEFHOOK
> > +(gen_memset_scratch_rtx,
> > + "This hook should return an rtx for scratch register in @var{mode} to\n\
> > +be used by memset broadcast. The default is @code{gen_reg_rtx}.",
> > + rtx, (machine_mode mode),
> > + gen_reg_rtx)
> > +
> > /* Return a new value for loop unroll size. */
> > DEFHOOK
> > (loop_unroll_adjust,
--
H.J.