On Fri, Jul 16, 2021 at 4:38 AM Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with
> > vec_duplicate_optab to duplicate QI value to TI/OI/XI value.
> > 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard
> > scratch register to avoid stack realignment when expanding memset.
> >
> >       PR middle-end/90773
> >       * builtins.c (gen_memset_value_from_prev): New function.
> >       (gen_memset_broadcast): Likewise.
> >       (builtin_memset_read_str): Use gen_memset_value_from_prev
> >       and gen_memset_broadcast.
> >       (builtin_memset_gen_str): Likewise.
> >       * target.def (gen_memset_scratch_rtx): New hook.
> >       * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.
> >       * doc/tm.texi: Regenerated.
> > ---
> >  gcc/builtins.c     | 123 +++++++++++++++++++++++++++++++++++++--------
> >  gcc/doc/tm.texi    |   5 ++
> >  gcc/doc/tm.texi.in |   2 +
> >  gcc/target.def     |   7 +++
> >  4 files changed, 116 insertions(+), 21 deletions(-)
> >
> > diff --git a/gcc/builtins.c b/gcc/builtins.c
> > index 39ab139b7e1..c1758ae2efc 100644
> > --- a/gcc/builtins.c
> > +++ b/gcc/builtins.c
> > @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target)
> >    return NULL_RTX;
> >  }
> >
> > -/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
> > -   bytes from constant string DATA + OFFSET and return it as target
> > -   constant.  If PREV isn't nullptr, it has the RTL info from the
> > +/* Return the RTL of a register in MODE generated from PREV in the
> >     previous iteration.  */
> >
> > -rtx
> > -builtin_memset_read_str (void *data, void *prevp,
> > -                      HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > -                      scalar_int_mode mode)
> > +static rtx
> > +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode)
> >  {
> > +  rtx target = nullptr;
> >    by_pieces_prev *prev = (by_pieces_prev *) prevp;
> >    if (prev != nullptr && prev->data != nullptr)
> >      {
> >        /* Use the previous data in the same mode.  */
> >        if (prev->mode == mode)
> >       return prev->data;
> > +
> > +      rtx prev_rtx = prev->data;
> > +      machine_mode prev_mode = prev->mode;
> > +      unsigned int word_size = GET_MODE_SIZE (word_mode);
> > +      if (word_size < GET_MODE_SIZE (prev->mode)
> > +       && word_size > GET_MODE_SIZE (mode))
> > +     {
> > +       /* First generate subreg of word mode if the previous mode is
> > +          wider than word mode and word mode is wider than MODE.  */
> > +       prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
> > +                                       prev_mode, 0);
> > +       prev_mode = word_mode;
> > +     }
> > +      if (prev_rtx != nullptr)
> > +     target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
> >      }
> > +  return target;
> > +}
> > +
> > +/* Return the RTL of a register in MODE broadcasted from DATA.  */
> > +
> > +static rtx
> > +gen_memset_broadcast (rtx data, scalar_int_mode mode)
> > +{
> > +  /* Skip if regno_reg_rtx isn't initialized.  */
> > +  if (!regno_reg_rtx)
> > +    return nullptr;
> > +
> > +  rtx target = nullptr;
> > +
> > +  unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);
> > +  machine_mode vector_mode;
> > +  if (!mode_for_vector (QImode, nunits).exists (&vector_mode))
> > +    gcc_unreachable ();
>
> Sorry, I realise it's a bit late to be raising this objection now,
> but I don't think it's a good idea to use scalar integer modes as
> a proxy for vector modes.  In principle there's no reason why a
> target has to define an integer mode for every vector mode.

A target always defines the largest integer mode.

> If we want the mode to be a vector then I think the by-pieces
> infrastructure should be extended to support vectors directly,
> rather than assuming that each piece can be represented as
> a scalar_int_mode.
>

The current by-pieces infrastructure operates on scalar_int_mode.
Only for memset, there is

/* Callback routine for store_by_pieces.  Return the RTL of a register
   containing GET_MODE_SIZE (MODE) consecutive copies of the unsigned
   char value given in the RTL register data.  For example, if mode is
   4 bytes wide, return the RTL for 0x01010101*data.  If PREV isn't
   nullptr, it has the RTL info from the previous iteration.  */

static rtx
builtin_memset_gen_str (void *data, void *prevp,
                        HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
                        scalar_int_mode mode)

It is a broadcast.  If a target can broadcast a byte to a wider integer,
can you suggest a way to use it in the current by-pieces infrastructure?

Thanks.

> Thanks,
> Richard
>
> > +
> > +  enum insn_code icode = optab_handler (vec_duplicate_optab,
> > +                                     vector_mode);
> > +  if (icode != CODE_FOR_nothing)
> > +    {
> > +      rtx reg = targetm.gen_memset_scratch_rtx (vector_mode);
> > +      if (CONST_INT_P (data))
> > +     {
> > +       /* Use the move expander with CONST_VECTOR.  */
> > +       rtx const_vec = gen_const_vec_duplicate (vector_mode, data);
> > +       emit_move_insn (reg, const_vec);
> > +     }
> > +      else
> > +     {
> > +
> > +       class expand_operand ops[2];
> > +       create_output_operand (&ops[0], reg, vector_mode);
> > +       create_input_operand (&ops[1], data, QImode);
> > +       expand_insn (icode, 2, ops);
> > +       if (!rtx_equal_p (reg, ops[0].value))
> > +         emit_move_insn (reg, ops[0].value);
> > +     }
> > +      target = lowpart_subreg (mode, reg, vector_mode);
> > +    }
> > +
> > +  return target;
> > +}
> > +
> > +/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
> > +   bytes from constant string DATA + OFFSET and return it as target
> > +   constant.  If PREV isn't nullptr, it has the RTL info from the
> > +   previous iteration.  */
> >
> > +rtx
> > +builtin_memset_read_str (void *data, void *prev,
> > +                      HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > +                      scalar_int_mode mode)
> > +{
> > +  rtx target;
> >    const char *c = (const char *) data;
> > -  char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
> > +  char *p;
> > +
> > +  /* Don't use the previous value if size is 1.  */
> > +  if (GET_MODE_SIZE (mode) != 1)
> > +    {
> > +      target = gen_memset_value_from_prev (prev, mode);
> > +      if (target != nullptr)
> > +     return target;
> > +
> > +      p = XALLOCAVEC (char, GET_MODE_SIZE (QImode));
> > +      memset (p, *c, GET_MODE_SIZE (QImode));
> > +      rtx src = c_readstr (p, QImode);
> > +      target = gen_memset_broadcast (src, mode);
> > +      if (target != nullptr)
> > +     return target;
> > +    }
> > +
> > +  p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
> >
> >    memset (p, *c, GET_MODE_SIZE (mode));
> >
> > @@ -6719,7 +6804,7 @@ builtin_memset_read_str (void *data, void *prevp,
> >     nullptr, it has the RTL info from the previous iteration.  */
> >
> >  static rtx
> > -builtin_memset_gen_str (void *data, void *prevp,
> > +builtin_memset_gen_str (void *data, void *prev,
> >                       HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> >                       scalar_int_mode mode)
> >  {
> > @@ -6727,22 +6812,18 @@ builtin_memset_gen_str (void *data, void *prevp,
> >    size_t size;
> >    char *p;
> >
> > -  by_pieces_prev *prev = (by_pieces_prev *) prevp;
> > -  if (prev != nullptr && prev->data != nullptr)
> > -    {
> > -      /* Use the previous data in the same mode.  */
> > -      if (prev->mode == mode)
> > -     return prev->data;
> > -
> > -      target = simplify_gen_subreg (mode, prev->data, prev->mode, 0);
> > -      if (target != nullptr)
> > -     return target;
> > -    }
> > -
> >    size = GET_MODE_SIZE (mode);
> >    if (size == 1)
> >      return (rtx) data;
> >
> > +  target = gen_memset_value_from_prev (prev, mode);
> > +  if (target != nullptr)
> > +    return target;
> > +
> > +  target = gen_memset_broadcast ((rtx) data, mode);
> > +  if (target != nullptr)
> > +    return target;
> > +
> >    p = XALLOCAVEC (char, size);
> >    memset (p, 1, size);
> >    coeff = c_readstr (p, mode);
> > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> > index 2a41ae5fba1..8849711fcf8 100644
> > --- a/gcc/doc/tm.texi
> > +++ b/gcc/doc/tm.texi
> > @@ -12122,6 +12122,11 @@ This function prepares to emit a conditional 
> > comparison within a sequence
> >   @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the 
> > compares.
> >  @end deftypefn
> >
> > +@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode 
> > @var{mode})
> > +This hook should return an rtx for scratch register in @var{mode} to
> > +be used by memset broadcast. The default is @code{gen_reg_rtx}.
> > +@end deftypefn
> > +
> >  @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned 
> > @var{nunroll}, class loop *@var{loop})
> >  This target hook returns a new value for the number of times @var{loop}
> >  should be unrolled. The parameter @var{nunroll} is the number of times
> > diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> > index f881cdabe9e..a6bbf4f2667 100644
> > --- a/gcc/doc/tm.texi.in
> > +++ b/gcc/doc/tm.texi.in
> > @@ -7958,6 +7958,8 @@ lists.
> >
> >  @hook TARGET_GEN_CCMP_NEXT
> >
> > +@hook TARGET_GEN_MEMSET_SCRATCH_RTX
> > +
> >  @hook TARGET_LOOP_UNROLL_ADJUST
> >
> >  @defmac POWI_MAX_MULTS
> > diff --git a/gcc/target.def b/gcc/target.def
> > index c009671c583..5fb287db3bd 100644
> > --- a/gcc/target.def
> > +++ b/gcc/target.def
> > @@ -2726,6 +2726,13 @@ DEFHOOK
> >   rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, 
> > tree op0, tree op1, int bit_code),
> >   NULL)
> >
> > +DEFHOOK
> > +(gen_memset_scratch_rtx,
> > + "This hook should return an rtx for scratch register in @var{mode} to\n\
> > +be used by memset broadcast. The default is @code{gen_reg_rtx}.",
> > + rtx, (machine_mode mode),
> > + gen_reg_rtx)
> > +
> >  /* Return a new value for loop unroll size.  */
> >  DEFHOOK
> >  (loop_unroll_adjust,



-- 
H.J.

Reply via email to