On Mon, Jul 19, 2021 at 7:41 AM Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > On Fri, Jul 16, 2021 at 7:15 AM H.J. Lu <hjl.to...@gmail.com> wrote:
> >>
> >> On Fri, Jul 16, 2021 at 6:24 AM Richard Sandiford
> >> <richard.sandif...@arm.com> wrote:
> >> >
> >> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> >> > > On Fri, Jul 16, 2021 at 4:38 AM Richard Sandiford
> >> > > <richard.sandif...@arm.com> wrote:
> >> > >>
> >> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> >> > >> > 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with
> >> > >> > vec_duplicate_optab to duplicate QI value to TI/OI/XI value.
> >> > >> > 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a 
> >> > >> > hard
> >> > >> > scratch register to avoid stack realignment when expanding memset.
> >> > >> >
> >> > >> >       PR middle-end/90773
> >> > >> >       * builtins.c (gen_memset_value_from_prev): New function.
> >> > >> >       (gen_memset_broadcast): Likewise.
> >> > >> >       (builtin_memset_read_str): Use gen_memset_value_from_prev
> >> > >> >       and gen_memset_broadcast.
> >> > >> >       (builtin_memset_gen_str): Likewise.
> >> > >> >       * target.def (gen_memset_scratch_rtx): New hook.
> >> > >> >       * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.
> >> > >> >       * doc/tm.texi: Regenerated.
> >> > >> > ---
> >> > >> >  gcc/builtins.c     | 123 
> >> > >> > +++++++++++++++++++++++++++++++++++++--------
> >> > >> >  gcc/doc/tm.texi    |   5 ++
> >> > >> >  gcc/doc/tm.texi.in |   2 +
> >> > >> >  gcc/target.def     |   7 +++
> >> > >> >  4 files changed, 116 insertions(+), 21 deletions(-)
> >> > >> >
> >> > >> > diff --git a/gcc/builtins.c b/gcc/builtins.c
> >> > >> > index 39ab139b7e1..c1758ae2efc 100644
> >> > >> > --- a/gcc/builtins.c
> >> > >> > +++ b/gcc/builtins.c
> >> > >> > @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx 
> >> > >> > target)
> >> > >> >    return NULL_RTX;
> >> > >> >  }
> >> > >> >
> >> > >> > -/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE 
> >> > >> > (MODE)
> >> > >> > -   bytes from constant string DATA + OFFSET and return it as target
> >> > >> > -   constant.  If PREV isn't nullptr, it has the RTL info from the
> >> > >> > +/* Return the RTL of a register in MODE generated from PREV in the
> >> > >> >     previous iteration.  */
> >> > >> >
> >> > >> > -rtx
> >> > >> > -builtin_memset_read_str (void *data, void *prevp,
> >> > >> > -                      HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> >> > >> > -                      scalar_int_mode mode)
> >> > >> > +static rtx
> >> > >> > +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode)
> >> > >> >  {
> >> > >> > +  rtx target = nullptr;
> >> > >> >    by_pieces_prev *prev = (by_pieces_prev *) prevp;
> >> > >> >    if (prev != nullptr && prev->data != nullptr)
> >> > >> >      {
> >> > >> >        /* Use the previous data in the same mode.  */
> >> > >> >        if (prev->mode == mode)
> >> > >> >       return prev->data;
> >> > >> > +
> >> > >> > +      rtx prev_rtx = prev->data;
> >> > >> > +      machine_mode prev_mode = prev->mode;
> >> > >> > +      unsigned int word_size = GET_MODE_SIZE (word_mode);
> >> > >> > +      if (word_size < GET_MODE_SIZE (prev->mode)
> >> > >> > +       && word_size > GET_MODE_SIZE (mode))
> >> > >> > +     {
> >> > >> > +       /* First generate subreg of word mode if the previous mode 
> >> > >> > is
> >> > >> > +          wider than word mode and word mode is wider than MODE.  
> >> > >> > */
> >> > >> > +       prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
> >> > >> > +                                       prev_mode, 0);
> >> > >> > +       prev_mode = word_mode;
> >> > >> > +     }
> >> > >> > +      if (prev_rtx != nullptr)
> >> > >> > +     target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
> >> > >> >      }
> >> > >> > +  return target;
> >> > >> > +}
> >> > >> > +
> >> > >> > +/* Return the RTL of a register in MODE broadcasted from DATA.  */
> >> > >> > +
> >> > >> > +static rtx
> >> > >> > +gen_memset_broadcast (rtx data, scalar_int_mode mode)
> >> > >> > +{
> >> > >> > +  /* Skip if regno_reg_rtx isn't initialized.  */
> >> > >> > +  if (!regno_reg_rtx)
> >> > >> > +    return nullptr;
> >> > >> > +
> >> > >> > +  rtx target = nullptr;
> >> > >> > +
> >> > >> > +  unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE 
> >> > >> > (QImode);
> >> > >> > +  machine_mode vector_mode;
> >> > >> > +  if (!mode_for_vector (QImode, nunits).exists (&vector_mode))
> >> > >> > +    gcc_unreachable ();
> >> > >>
> >> > >> Sorry, I realise it's a bit late to be raising this objection now,
> >> > >> but I don't think it's a good idea to use scalar integer modes as
> >> > >> a proxy for vector modes.  In principle there's no reason why a
> >> > >> target has to define an integer mode for every vector mode.
> >> > >
> >> > > A target always defines the largest integer mode.
> >> >
> >> > Right.  But a target shouldn't *need* to define an integer mode
> >> > for every vector mode.
> >> >
> >> > >> If we want the mode to be a vector then I think the by-pieces
> >> > >> infrastructure should be extended to support vectors directly,
> >> > >> rather than assuming that each piece can be represented as
> >> > >> a scalar_int_mode.
> >> > >>
> >> > >
> >> > > The current by-pieces infrastructure operates on scalar_int_mode.
> >> > > Only for memset, there is
> >> > >
> >> > > /* Callback routine for store_by_pieces.  Return the RTL of a register
> >> > >    containing GET_MODE_SIZE (MODE) consecutive copies of the unsigned
> >> > >    char value given in the RTL register data.  For example, if mode is
> >> > >    4 bytes wide, return the RTL for 0x01010101*data.  If PREV isn't
> >> > >    nullptr, it has the RTL info from the previous iteration.  */
> >> > >
> >> > > static rtx
> >> > > builtin_memset_gen_str (void *data, void *prevp,
> >> > >                         HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> >> > >                         scalar_int_mode mode)
> >> > >
> >> > > It is a broadcast.  If a target can broadcast a byte to a wider 
> >> > > integer,
> >> > > can you suggest a way to use it in the current by-pieces 
> >> > > infrastructure?
> >> >
> >> > I meant that we should extend the by-pieces infrastructure so that
> >> > it no longer requires scalar_int_mode, rather than work within the
> >> > current limits.
> >> >
> >> > IMO the fact that we're (legitimately) going through vec_duplicate_optab
> >>
> >> If vec_duplicate_optab is the only blocking issue, can we add an integer
> >> broadcast optab?  I want to avoid a major surgery just because we
> >> don't support integer broadcast from byte.
> >>
> >> > shows that we really are dealing with vectors here, not integers.
> >> >
> >>
> >> Are you suggesting we should add the optional vector mode
> >> support to by-pieces for memset?
> >>
> >
> > Like this?
> >
> > 1. Replace scalar_int_mode with machine_mode in the by-pieces
> > infrastructure to allow non-integer mode.
>
> I guess it might be better to use fixed_size_mode instead of
> machine_mode.

Fixed.

> > 2. Rename widest_int_mode_for_size to widest_int_or_QI_vector_mode_for_size
> > to return QI vector mode for memset.
> > 3. Add op_by_pieces_d::smallest_mode_for_size to return the smallest
> > integer or QI vector mode.
> >
> > Thanks.
> >
> > --
> > H.J.
> >
> > From 171f0a919300a8aac58dfe5fc9904eaa8fe0a05b Mon Sep 17 00:00:00 2001
> > From: "H.J. Lu" <hjl.to...@gmail.com>
> > Date: Sun, 6 Mar 2016 06:38:21 -0800
> > Subject: [PATCH] Add QI vector mode support to by-pieces for memset
> >
> > 1. Replace scalar_int_mode with machine_mode in the by-pieces
> > infrastructure to allow non-integer mode.
> > 2. Rename widest_int_mode_for_size to widest_int_or_QI_vector_mode_for_size
> > to return QI vector mode for memset.
> > 3. Add op_by_pieces_d::smallest_mode_for_size to return the smallest
> > integer or QI vector mode.
> >
> > gcc/
> >
> >       PR middle-end/90773
> >       * builtins.c (builtin_memcpy_read_str): Change the mode argument
> >       from scalar_int_mode to machine_mode.
> >       (builtin_strncpy_read_str): Likewise.
> >       (gen_memset_value_from_prev): New function.
> >       (gen_memset_broadcast): Likewise.
> >       (builtin_memset_read_str): Change the mode argument from
> >       scalar_int_mode to machine_mode.  Use gen_memset_value_from_prev
> >       and gen_memset_broadcast.
> >       (builtin_memset_gen_str): Likewise.
> >       (try_store_by_multiple_pieces): Use by_pieces_constfn to declare
> >       constfun.
> >       * builtins.h (builtin_strncpy_read_str): Updated.
> >       (builtin_memset_read_str): Likewise.
> >       * expr.c (widest_int_mode_for_size): Renamed to ...
> >       (widest_int_or_QI_vector_mode_for_size): Add a bool argument to
> >       indicate if QI vector mode can be used.
> >       (by_pieces_ninsns): Call widest_int_or_QI_vector_mode_for_size
> >       instead of widest_int_mode_for_size.
> >       (pieces_addr::adjust): Change the mode argument from
> >       scalar_int_mode to machine_mode.
> >       (op_by_pieces_d): Add a bool member, m_qi_vector_mode, to
> >       indicate that QI vector mode can be used.
> >       (op_by_pieces_d::op_by_pieces_d): Add a bool argument to initialize
> >       m_qi_vector_mode.  Call widest_int_or_QI_vector_mode_for_size
> >       instead of widest_int_mode_for_size.
> >       (op_by_pieces_d::get_usable_mode): Change the mode argument from
> >       scalar_int_mode to machine_mode.  Call
> >       widest_int_or_QI_vector_mode_for_size instead of
> >       widest_int_mode_for_size.
> >       (op_by_pieces_d::smallest_mode_for_size): New member function
> >       to return the smallest integer or QI vector mode.
> >       (op_by_pieces_d::run): Call widest_int_or_QI_vector_mode_for_size
> >       instead of widest_int_mode_for_size.  Call smallest_mode_for_size
> >       instead of smallest_int_mode_for_size.
> >       (store_by_pieces_d::store_by_pieces_d): Add a bool argument to
> >       indicate that QI vector mode can be used and pass it to
> >       op_by_pieces_d::op_by_pieces_d.
> >       (can_store_by_pieces): Call widest_int_or_QI_vector_mode_for_size
> >       instead of widest_int_mode_for_size.
> >       (store_by_pieces::store_by_pieces): Pass memsetp to
> >       store_by_pieces_d::store_by_pieces_d.
> >       (clear_by_pieces_1): Change scalar_int_mode to machine_mode.
> >       (string_cst_read_str): Change the mode argument from
> >       scalar_int_mode to machine_mode.
> >       * expr.h (by_pieces_constfn): Change scalar_int_mode to
> >       machine_mode.
> >       (by_pieces_prev): Likewise.
> >       * target.def (gen_memset_scratch_rtx): New hook.
> >       * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.
> >       * doc/tm.texi: Regenerated.
> >
> > gcc/testsuite/
> >
> >       * gcc.target/i386/pr100865-3.c: Expect vmovdqu8 instead of
> >       vmovdqu.
> >       * gcc.target/i386/pr100865-4b.c: Likewise.
> > ---
> >  gcc/builtins.c                              | 138 ++++++++++++++-----
> >  gcc/builtins.h                              |   4 +-
> >  gcc/doc/tm.texi                             |   5 +
> >  gcc/doc/tm.texi.in                          |   2 +
> >  gcc/expr.c                                  | 145 ++++++++++++++------
> >  gcc/expr.h                                  |   4 +-
> >  gcc/target.def                              |   7 +
> >  gcc/testsuite/gcc.target/i386/pr100865-3.c  |   2 +-
> >  gcc/testsuite/gcc.target/i386/pr100865-4b.c |   2 +-
> >  9 files changed, 229 insertions(+), 80 deletions(-)
> >
> > diff --git a/gcc/builtins.c b/gcc/builtins.c
> > index 39ab139b7e1..876378b467c 100644
> > --- a/gcc/builtins.c
> > +++ b/gcc/builtins.c
> > @@ -3890,13 +3890,14 @@ expand_builtin_strnlen (tree exp, rtx target, 
> > machine_mode target_mode)
> >
> >  static rtx
> >  builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset,
> > -                      scalar_int_mode mode)
> > +                      machine_mode mode)
> >  {
> >    /* The REPresentation pointed to by DATA need not be a nul-terminated
> >       string but the caller guarantees it's large enough for MODE.  */
> >    const char *rep = (const char *) data;
> >
> > -  return c_readstr (rep + offset, mode, /*nul_terminated=*/false);
> > +  return c_readstr (rep + offset, as_a <scalar_int_mode> (mode),
> > +                 /*nul_terminated=*/false);
> >  }
> >
> >  /* LEN specify length of the block of memcpy/memset operation.
> > @@ -6478,14 +6479,14 @@ expand_builtin_stpncpy (tree exp, rtx)
> >
> >  rtx
> >  builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset,
> > -                       scalar_int_mode mode)
> > +                       machine_mode mode)
> >  {
> >    const char *str = (const char *) data;
> >
> >    if ((unsigned HOST_WIDE_INT) offset > strlen (str))
> >      return const0_rtx;
> >
> > -  return c_readstr (str + offset, mode);
> > +  return c_readstr (str + offset, as_a <scalar_int_mode> (mode));
> >  }
> >  /* Helper to check the sizes of sequences and the destination of calls
>
> I think both of these as_a<scalar_int_mode>s should have a comment
> explaining that we don't (yet?) consider using vector modes for this
> operation.

Fixed.

> > @@ -6686,30 +6687,109 @@ expand_builtin_strncpy (tree exp, rtx target)
> >    return NULL_RTX;
> >  }
> >
> > -/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
> > -   bytes from constant string DATA + OFFSET and return it as target
> > -   constant.  If PREV isn't nullptr, it has the RTL info from the
> > +/* Return the RTL of a register in MODE generated from PREV in the
> >     previous iteration.  */
> >
> > -rtx
> > -builtin_memset_read_str (void *data, void *prevp,
> > -                      HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > -                      scalar_int_mode mode)
> > +static rtx
> > +gen_memset_value_from_prev (void *prevp, machine_mode mode)
> >  {
> > +  rtx target = nullptr;
> >    by_pieces_prev *prev = (by_pieces_prev *) prevp;
>
> I think it'd be better to pass the proper type instead, since this
> function isn't directly implementing a callback interface.

Fixed.

> >    if (prev != nullptr && prev->data != nullptr)
> >      {
> >        /* Use the previous data in the same mode.  */
> >        if (prev->mode == mode)
> >       return prev->data;
> > +
> > +      rtx prev_rtx = prev->data;
> > +      machine_mode prev_mode = prev->mode;
> > +      unsigned int word_size = GET_MODE_SIZE (word_mode);
> > +      if (word_size < GET_MODE_SIZE (prev->mode).to_constant ()
> > +       && word_size > GET_MODE_SIZE (mode).to_constant ())
>
> Using fixed_size_mode would also avoid the to_constants here.

Fixed.

> UNITS_PER_WORD is more direct/canonical than GET_MODE_SIZE (word_mode).

Fixed.

> > +     {
> > +       /* First generate subreg of word mode if the previous mode is
> > +          wider than word mode and word mode is wider than MODE.  */
> > +       prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
> > +                                       prev_mode, 0);
> > +       prev_mode = word_mode;
> > +     }
> > +      if (prev_rtx != nullptr)
> > +     target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
>
> This should be lowpart_subreg, since 0 isn't the right offset for
> big-endian targets.  Using lowpart_subreg should also avoid the need
> for the word_size “if” above: lowpart_subreg can handle lowpart subword
> subregs of multiword values.

I tried it.  It didn't work since it caused the LRA failure.   I replaced
simplify_gen_subreg with lowpart_subreg instead.

> > +    }
> > +  return target;
> > +}
> > +
> > +/* Return the RTL of a register in MODE broadcasted from DATA.  */
> > +
> > +static rtx
> > +gen_memset_broadcast (rtx data, machine_mode mode)
> > +{
> > +  /* Skip if regno_reg_rtx isn't initialized or not vector mode.  */
>
> Why's the regno_reg_rtx condition needed?

Removed.

> > +  if (!regno_reg_rtx || !VECTOR_MODE_P (mode))
> > +    return nullptr;
> > +
> > +  gcc_assert (GET_MODE_INNER (mode) == QImode);
> > +
> > +  enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
> > +
> > +  gcc_assert (icode != CODE_FOR_nothing);
>
> Might be worth a comment here saying that having vec_duplicate_optab is
> a precondition for picking a vector mode.

I added a comment.

> > +
> > +  rtx target = targetm.gen_memset_scratch_rtx (mode);
> > +  if (CONST_INT_P (data))
> > +    {
> > +      /* Use the move expander with CONST_VECTOR.  */
> > +      rtx const_vec = gen_const_vec_duplicate (mode, data);
> > +      emit_move_insn (target, const_vec);
> > +    }
> > +  else
> > +    {
> > +
>
> Nit: excess blank line.

Fixed.

> > +      class expand_operand ops[2];
> > +      create_output_operand (&ops[0], target, mode);
> > +      create_input_operand (&ops[1], data, QImode);
> > +      expand_insn (icode, 2, ops);
> > +      if (!rtx_equal_p (target, ops[0].value))
> > +     emit_move_insn (target, ops[0].value);
> >      }
> >
> > +  return target;
> > +}
> > +
> > +/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
> > +   bytes from constant string DATA + OFFSET and return it as target
> > +   constant.  If PREV isn't nullptr, it has the RTL info from the
> > +   previous iteration.  */
> > +
> > +rtx
> > +builtin_memset_read_str (void *data, void *prev,
> > +                      HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > +                      machine_mode mode)
> > +{
> > +  rtx target;
> >    const char *c = (const char *) data;
> > -  char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
> > +  char *p;
> > +  unsigned int size = GET_MODE_SIZE (mode).to_constant ();
> > +
> > +  /* Don't use the previous value if size is 1.  */
>
> Why not though?  The existing code does, and that seems like the right
> thing to do when operating on integers.
>
> I can see the check would be a good thing to do if MODE isn't a vector
> mode and the previous mode was.  Is that the case you're worried about?
> If so, that check could go in gen_memset_value_from_prev instead.

We are storing one byte.  Doing it directly is faster.

> > +  if (size != 1)
> > +    {
> > +      target = gen_memset_value_from_prev (prev, mode);
> > +      if (target != nullptr)
> > +     return target;
> > +
> > +      p = XALLOCAVEC (char, GET_MODE_SIZE (QImode));
> > +      memset (p, *c, GET_MODE_SIZE (QImode));
> > +      rtx src = c_readstr (p, QImode);
>
> This seems unnecessarily complicated.  Couldn't it just be
>
>   gen_int_mode (QImode, *c);
>
> instead?  There are no endianness concerns for single QI values.

Fixed.

> > +      target = gen_memset_broadcast (src, mode);
> > +      if (target != nullptr)
> > +     return target;
> > +    }
> > +
> > +  p = XALLOCAVEC (char, size);
> >
> > -  memset (p, *c, GET_MODE_SIZE (mode));
> > +  memset (p, *c, size);
> >
> > -  return c_readstr (p, mode);
> > +  return c_readstr (p, as_a <scalar_int_mode> (mode));
> >  }
> >
> >  /* Callback routine for store_by_pieces.  Return the RTL of a register
> > @@ -6719,33 +6799,29 @@ builtin_memset_read_str (void *data, void *prevp,
> >     nullptr, it has the RTL info from the previous iteration.  */
> >
> >  static rtx
> > -builtin_memset_gen_str (void *data, void *prevp,
> > +builtin_memset_gen_str (void *data, void *prev,
> >                       HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > -                     scalar_int_mode mode)
> > +                     machine_mode mode)
> >  {
> >    rtx target, coeff;
> >    size_t size;
> >    char *p;
> >
> > -  by_pieces_prev *prev = (by_pieces_prev *) prevp;
> > -  if (prev != nullptr && prev->data != nullptr)
> > -    {
> > -      /* Use the previous data in the same mode.  */
> > -      if (prev->mode == mode)
> > -     return prev->data;
> > -
> > -      target = simplify_gen_subreg (mode, prev->data, prev->mode, 0);
> > -      if (target != nullptr)
> > -     return target;
> > -    }
> > -
> > -  size = GET_MODE_SIZE (mode);
> > +  size = GET_MODE_SIZE (mode).to_constant ();
> >    if (size == 1)
> >      return (rtx) data;
> >
> > +  target = gen_memset_value_from_prev (prev, mode);
> > +  if (target != nullptr)
> > +    return target;
> > +
> > +  target = gen_memset_broadcast ((rtx) data, mode);
> > +  if (target != nullptr)
> > +    return target;
> > +
> >    p = XALLOCAVEC (char, size);
> >    memset (p, 1, size);
> > -  coeff = c_readstr (p, mode);
> > +  coeff = c_readstr (p, as_a <scalar_int_mode> (mode));
> >
> >    target = convert_to_mode (mode, (rtx) data, 1);
> >    target = expand_mult (mode, target, coeff, NULL_RTX, 1);
> > @@ -6849,7 +6925,7 @@ try_store_by_multiple_pieces (rtx to, rtx len, 
> > unsigned int ctz_len,
> >                           &valc, align, true))
> >      return false;
> >
> > -  rtx (*constfun) (void *, void *, HOST_WIDE_INT, scalar_int_mode);
> > +  by_pieces_constfn constfun;
> >    void *constfundata;
> >    if (val)
> >      {
> > diff --git a/gcc/builtins.h b/gcc/builtins.h
> > index a64ece3f1cd..31d07621750 100644
> > --- a/gcc/builtins.h
> > +++ b/gcc/builtins.h
> > @@ -111,9 +111,9 @@ extern tree mathfn_built_in (tree, enum 
> > built_in_function fn);
> >  extern tree mathfn_built_in (tree, combined_fn);
> >  extern tree mathfn_built_in_type (combined_fn);
> >  extern rtx builtin_strncpy_read_str (void *, void *, HOST_WIDE_INT,
> > -                                  scalar_int_mode);
> > +                                  machine_mode);
> >  extern rtx builtin_memset_read_str (void *, void *, HOST_WIDE_INT,
> > -                                 scalar_int_mode);
> > +                                 machine_mode);
> >  extern rtx expand_builtin_saveregs (void);
> >  extern tree std_build_builtin_va_list (void);
> >  extern tree std_fn_abi_va_list (tree);
> > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> > index 3ad39443eba..c1995270720 100644
> > --- a/gcc/doc/tm.texi
> > +++ b/gcc/doc/tm.texi
> > @@ -12123,6 +12123,11 @@ This function prepares to emit a conditional 
> > comparison within a sequence
> >   @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the 
> > compares.
> >  @end deftypefn
> >
> > +@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode 
> > @var{mode})
> > +This hook should return an rtx for scratch register in @var{mode} to
> > +be used by memset broadcast. The default is @code{gen_reg_rtx}.
>
> This should give a hint why the default might not be appropriate.

Updated.

> > +@end deftypefn
> > +
> >  @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned 
> > @var{nunroll}, class loop *@var{loop})
> >  This target hook returns a new value for the number of times @var{loop}
> >  should be unrolled. The parameter @var{nunroll} is the number of times
> > diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> > index f881cdabe9e..a6bbf4f2667 100644
> > --- a/gcc/doc/tm.texi.in
> > +++ b/gcc/doc/tm.texi.in
> > @@ -7958,6 +7958,8 @@ lists.
> >
> >  @hook TARGET_GEN_CCMP_NEXT
> >
> > +@hook TARGET_GEN_MEMSET_SCRATCH_RTX
> > +
> >  @hook TARGET_LOOP_UNROLL_ADJUST
> >
> >  @defmac POWI_MAX_MULTS
> > diff --git a/gcc/expr.c b/gcc/expr.c
> > index 6a4368113c4..92dbf47194e 100644
> > --- a/gcc/expr.c
> > +++ b/gcc/expr.c
> > @@ -769,15 +769,36 @@ alignment_for_piecewise_move (unsigned int 
> > max_pieces, unsigned int align)
> >    return align;
> >  }
> >
> > -/* Return the widest integer mode that is narrower than SIZE bytes.  */
> > +/* Return the widest integer or QI vector mode that is narrower than
> > +   SIZE bytes.  */
> >
> > -static scalar_int_mode
> > -widest_int_mode_for_size (unsigned int size)
> > +static machine_mode
> > +widest_int_or_QI_vector_mode_for_size (unsigned int size,
> > +                                    bool qi_vector)
> >  {
> > -  scalar_int_mode result = NARROWEST_INT_MODE;
> > +  machine_mode result = NARROWEST_INT_MODE;
> >
> >    gcc_checking_assert (size > 1);
> >
> > +  /* Use QI vector only if size is wider than a WORD.  */
> > +  if (qi_vector && size > GET_MODE_SIZE (word_mode))
> > +    {
> > +      machine_mode mode;
> > +      FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
> > +     if (GET_MODE_INNER (mode) == QImode
> > +         && GET_MODE_SIZE (mode).is_constant ())
> > +       {
> > +         if (GET_MODE_SIZE (mode).to_constant () >= size)
> > +           break;
> > +         if (optab_handler (vec_duplicate_optab, mode)
> > +             != CODE_FOR_nothing)
> > +           result = mode;
> > +       }
> > +
> > +      if (result != NARROWEST_INT_MODE)
> > +     return result;
> > +    }
> > +
> >    opt_scalar_int_mode tmode;
> >    FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT)
> >      if (GET_MODE_SIZE (tmode.require ()) < size)
> > @@ -815,16 +836,18 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned 
> > int align,
> >                 unsigned int max_size, by_pieces_operation op)
> >  {
> >    unsigned HOST_WIDE_INT n_insns = 0;
> > -  scalar_int_mode mode;
> > +  machine_mode mode;
> >
> >    if (targetm.overlap_op_by_pieces_p () && op != COMPARE_BY_PIECES)
> >      {
> >        /* NB: Round up L and ALIGN to the widest integer mode for
> >        MAX_SIZE.  */
> > -      mode = widest_int_mode_for_size (max_size);
> > +      mode = widest_int_or_QI_vector_mode_for_size (max_size,
> > +                                                 op == SET_BY_PIECES);
> >        if (optab_handler (mov_optab, mode) != CODE_FOR_nothing)
> >       {
> > -       unsigned HOST_WIDE_INT up = ROUND_UP (l, GET_MODE_SIZE (mode));
> > +       unsigned HOST_WIDE_INT up = ROUND_UP (l,
> > +                                             GET_MODE_SIZE 
> > (mode).to_constant ());
> >         if (up > l)
> >           l = up;
> >         align = GET_MODE_ALIGNMENT (mode);
> > @@ -835,10 +858,11 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned 
> > int align,
> >
> >    while (max_size > 1 && l > 0)
> >      {
> > -      mode = widest_int_mode_for_size (max_size);
> > +      mode = widest_int_or_QI_vector_mode_for_size (max_size,
> > +                                                 op == SET_BY_PIECES);
> >        enum insn_code icode;
> >
> > -      unsigned int modesize = GET_MODE_SIZE (mode);
> > +      unsigned int modesize = GET_MODE_SIZE (mode).to_constant ();
> >
> >        icode = optab_handler (mov_optab, mode);
> >        if (icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode))
> > @@ -903,8 +927,7 @@ class pieces_addr
> >    void *m_cfndata;
> >  public:
> >    pieces_addr (rtx, bool, by_pieces_constfn, void *);
> > -  rtx adjust (scalar_int_mode, HOST_WIDE_INT,
> > -           by_pieces_prev * = nullptr);
> > +  rtx adjust (machine_mode, HOST_WIDE_INT, by_pieces_prev * = nullptr);
> >    void increment_address (HOST_WIDE_INT);
> >    void maybe_predec (HOST_WIDE_INT);
> >    void maybe_postinc (HOST_WIDE_INT);
> > @@ -1006,7 +1029,7 @@ pieces_addr::decide_autoinc (machine_mode ARG_UNUSED 
> > (mode), bool reverse,
> >     but we still modify the MEM's properties.  */
> >
> >  rtx
> > -pieces_addr::adjust (scalar_int_mode mode, HOST_WIDE_INT offset,
> > +pieces_addr::adjust (machine_mode mode, HOST_WIDE_INT offset,
> >                    by_pieces_prev *prev)
> >  {
> >    if (m_constfn)
> > @@ -1060,7 +1083,8 @@ pieces_addr::maybe_postinc (HOST_WIDE_INT size)
> >  class op_by_pieces_d
> >  {
> >   private:
> > -  scalar_int_mode get_usable_mode (scalar_int_mode mode, unsigned int);
> > +  machine_mode get_usable_mode (machine_mode, unsigned int);
> > +  machine_mode smallest_mode_for_size (unsigned int);
> >
> >   protected:
> >    pieces_addr m_to, m_from;
> > @@ -1073,6 +1097,8 @@ class op_by_pieces_d
> >    bool m_push;
> >    /* True if targetm.overlap_op_by_pieces_p () returns true.  */
> >    bool m_overlap_op_by_pieces;
> > +  /* True if QI vector mode can be used.  */
> > +  bool m_qi_vector_mode;
> >
> >    /* Virtual functions, overriden by derived classes for the specific
> >       operation.  */
> > @@ -1084,7 +1110,8 @@ class op_by_pieces_d
> >
> >   public:
> >    op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *,
> > -               unsigned HOST_WIDE_INT, unsigned int, bool);
> > +               unsigned HOST_WIDE_INT, unsigned int, bool,
> > +               bool = false);
> >    void run ();
> >  };
> >
> > @@ -1099,11 +1126,12 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool 
> > to_load,
> >                               by_pieces_constfn from_cfn,
> >                               void *from_cfn_data,
> >                               unsigned HOST_WIDE_INT len,
> > -                             unsigned int align, bool push)
> > +                             unsigned int align, bool push,
> > +                             bool qi_vector_mode)
> >    : m_to (to, to_load, NULL, NULL),
> >      m_from (from, from_load, from_cfn, from_cfn_data),
> >      m_len (len), m_max_size (MOVE_MAX_PIECES + 1),
> > -    m_push (push)
> > +    m_push (push), m_qi_vector_mode (qi_vector_mode)
> >  {
> >    int toi = m_to.get_addr_inc ();
> >    int fromi = m_from.get_addr_inc ();
> > @@ -1124,7 +1152,9 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
> >    if (by_pieces_ninsns (len, align, m_max_size, MOVE_BY_PIECES) > 2)
> >      {
> >        /* Find the mode of the largest comparison.  */
> > -      scalar_int_mode mode = widest_int_mode_for_size (m_max_size);
> > +      machine_mode mode
> > +     = widest_int_or_QI_vector_mode_for_size (m_max_size,
> > +                                              m_qi_vector_mode);
> >
> >        m_from.decide_autoinc (mode, m_reverse, len);
> >        m_to.decide_autoinc (mode, m_reverse, len);
> > @@ -1139,22 +1169,42 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool 
> > to_load,
> >  /* This function returns the largest usable integer mode for LEN bytes
> >     whose size is no bigger than size of MODE.  */
> >
> > -scalar_int_mode
> > -op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len)
> > +machine_mode
> > +op_by_pieces_d::get_usable_mode (machine_mode mode, unsigned int len)
> >  {
> >    unsigned int size;
> >    do
> >      {
> > -      size = GET_MODE_SIZE (mode);
> > +      size = GET_MODE_SIZE (mode).to_constant ();
> >        if (len >= size && prepare_mode (mode, m_align))
> >       break;
> > -      /* NB: widest_int_mode_for_size checks SIZE > 1.  */
> > -      mode = widest_int_mode_for_size (size);
> > +      /* NB: widest_int_or_QI_vector_mode_for_size checks SIZE > 1.  */
> > +      mode = widest_int_or_QI_vector_mode_for_size (size,
> > +                                                 m_qi_vector_mode);
> >      }
> >    while (1);
> >    return mode;
> >  }
> >
> > +/* Return the smallest integer or QI vector mode that is not narrower
> > +   than SIZE bits.  */
> > +
> > +machine_mode
> > +op_by_pieces_d::smallest_mode_for_size (unsigned int size)
> > +{
> > +  /* Use QI vector only for > size of WORD.  */
> > +  if (m_qi_vector_mode && size > GET_MODE_PRECISION (word_mode))
> > +    {
> > +      machine_mode mode;
> > +      FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
> > +     if (GET_MODE_INNER (mode) == QImode
> > +         && known_ge (GET_MODE_PRECISION (mode), size))
> > +       return mode;
>
> Shouldn't this also test for vec_duplicate_optab?  I guess you're
> hoping that the previous register will be reused via a lowpart,
> but since that's up to the callbacks, I think we should be
> conservative here.

Fixed.   I will submit the v2 patch.

Thanks.

> Alternatively, we could bypass the callbacks when this code is used.
>
> Thanks,
> Richard
>
> > +    }
> > +
> > +  return smallest_int_mode_for_size (size);
> > +}
> > +
> >  /* This function contains the main loop used for expanding a block
> >     operation.  First move what we can in the largest integer mode,
> >     then go to successively smaller modes.  For every access, call
> > @@ -1166,8 +1216,10 @@ op_by_pieces_d::run ()
> >    if (m_len == 0)
> >      return;
> >
> > -  /* NB: widest_int_mode_for_size checks M_MAX_SIZE > 1.  */
> > -  scalar_int_mode mode = widest_int_mode_for_size (m_max_size);
> > +  /* NB: widest_int_or_QI_vector_mode_for_size checks M_MAX_SIZE > 1.  */
> > +  machine_mode mode
> > +    = widest_int_or_QI_vector_mode_for_size (m_max_size,
> > +                                          m_qi_vector_mode);
> >    mode = get_usable_mode (mode, m_len);
> >
> >    by_pieces_prev to_prev = { nullptr, mode };
> > @@ -1175,7 +1227,7 @@ op_by_pieces_d::run ()
> >
> >    do
> >      {
> > -      unsigned int size = GET_MODE_SIZE (mode);
> > +      unsigned int size = GET_MODE_SIZE (mode).to_constant ();
> >        rtx to1 = NULL_RTX, from1;
> >
> >        while (m_len >= size)
> > @@ -1214,9 +1266,10 @@ op_by_pieces_d::run ()
> >         /* NB: Generate overlapping operations if it is not a stack
> >            push since stack push must not overlap.  Get the smallest
> >            integer mode for M_LEN bytes.  */
> > -       mode = smallest_int_mode_for_size (m_len * BITS_PER_UNIT);
> > -       mode = get_usable_mode (mode, GET_MODE_SIZE (mode));
> > -       int gap = GET_MODE_SIZE (mode) - m_len;
> > +       mode = smallest_mode_for_size (m_len * BITS_PER_UNIT);
> > +       unsigned int mode_size = GET_MODE_SIZE (mode).to_constant ();
> > +       mode = get_usable_mode (mode, mode_size);
> > +       int gap = mode_size - m_len;
> >         if (gap > 0)
> >           {
> >             /* If size of MODE > M_LEN, generate the last operation
> > @@ -1231,8 +1284,9 @@ op_by_pieces_d::run ()
> >       }
> >        else
> >       {
> > -       /* NB: widest_int_mode_for_size checks SIZE > 1.  */
> > -       mode = widest_int_mode_for_size (size);
> > +       /* NB: widest_int_or_QI_vector_mode_for_size checks SIZE > 1.  */
> > +       mode = widest_int_or_QI_vector_mode_for_size (size,
> > +                                                     m_qi_vector_mode);
> >         mode = get_usable_mode (mode, m_len);
> >       }
> >      }
> > @@ -1355,9 +1409,10 @@ class store_by_pieces_d : public op_by_pieces_d
> >
> >   public:
> >    store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data,
> > -                  unsigned HOST_WIDE_INT len, unsigned int align)
> > +                  unsigned HOST_WIDE_INT len, unsigned int align,
> > +                  bool qi_vector_mode = false)
> >      : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len,
> > -                   align, false)
> > +                   align, false, qi_vector_mode)
> >    {
> >    }
> >    rtx finish_retmode (memop_ret);
> > @@ -1446,13 +1501,14 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len,
> >        max_size = STORE_MAX_PIECES + 1;
> >        while (max_size > 1 && l > 0)
> >       {
> > -       scalar_int_mode mode = widest_int_mode_for_size (max_size);
> > +       machine_mode mode
> > +         = widest_int_or_QI_vector_mode_for_size (max_size, memsetp);
> >
> >         icode = optab_handler (mov_optab, mode);
> >         if (icode != CODE_FOR_nothing
> >             && align >= GET_MODE_ALIGNMENT (mode))
> >           {
> > -           unsigned int size = GET_MODE_SIZE (mode);
> > +           unsigned int size = GET_MODE_SIZE (mode).to_constant ();
> >
> >             while (l >= size)
> >               {
> > @@ -1470,7 +1526,7 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len,
> >               }
> >           }
> >
> > -       max_size = GET_MODE_SIZE (mode);
> > +       max_size = GET_MODE_SIZE (mode).to_constant ();
> >       }
> >
> >        /* The code above should have handled everything.  */
> > @@ -1504,7 +1560,8 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,
> >                memsetp ? SET_BY_PIECES : STORE_BY_PIECES,
> >                optimize_insn_for_speed_p ()));
> >
> > -  store_by_pieces_d data (to, constfun, constfundata, len, align);
> > +  store_by_pieces_d data (to, constfun, constfundata, len, align,
> > +                       memsetp);
> >    data.run ();
> >
> >    if (retmode != RETURN_BEGIN)
> > @@ -1517,7 +1574,7 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,
> >     Return const0_rtx unconditionally.  */
> >
> >  static rtx
> > -clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, scalar_int_mode)
> > +clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, machine_mode)
> >  {
> >    return const0_rtx;
> >  }
> > @@ -5754,7 +5811,7 @@ emit_storent_insn (rtx to, rtx from)
> >
> >  static rtx
> >  string_cst_read_str (void *data, void *, HOST_WIDE_INT offset,
> > -                  scalar_int_mode mode)
> > +                  machine_mode mode)
> >  {
> >    tree str = (tree) data;
> >
> > @@ -5762,17 +5819,19 @@ string_cst_read_str (void *data, void *, 
> > HOST_WIDE_INT offset,
> >    if (offset >= TREE_STRING_LENGTH (str))
> >      return const0_rtx;
> >
> > -  if ((unsigned HOST_WIDE_INT) offset + GET_MODE_SIZE (mode)
> > +  unsigned int size = GET_MODE_SIZE (mode).to_constant ();
> > +  if ((unsigned HOST_WIDE_INT) offset + size
> >        > (unsigned HOST_WIDE_INT) TREE_STRING_LENGTH (str))
> >      {
> > -      char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
> > +      char *p = XALLOCAVEC (char, size);
> >        size_t l = TREE_STRING_LENGTH (str) - offset;
> >        memcpy (p, TREE_STRING_POINTER (str) + offset, l);
> > -      memset (p + l, '\0', GET_MODE_SIZE (mode) - l);
> > -      return c_readstr (p, mode, false);
> > +      memset (p + l, '\0', size - l);
> > +      return c_readstr (p, as_a <scalar_int_mode> (mode), false);
> >      }
> >
> > -  return c_readstr (TREE_STRING_POINTER (str) + offset, mode, false);
> > +  return c_readstr (TREE_STRING_POINTER (str) + offset,
> > +                 as_a <scalar_int_mode> (mode), false);
> >  }
> >
> >  /* Generate code for computing expression EXP,
> > diff --git a/gcc/expr.h b/gcc/expr.h
> > index a4f44265759..552645e4403 100644
> > --- a/gcc/expr.h
> > +++ b/gcc/expr.h
> > @@ -108,13 +108,13 @@ enum block_op_methods
> >  };
> >
> >  typedef rtx (*by_pieces_constfn) (void *, void *, HOST_WIDE_INT,
> > -                               scalar_int_mode);
> > +                               machine_mode);
> >
> >  /* The second pointer passed to by_pieces_constfn.  */
> >  struct by_pieces_prev
> >  {
> >    rtx data;
> > -  scalar_int_mode mode;
> > +  machine_mode mode;
> >  };
> >
> >  extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
> > diff --git a/gcc/target.def b/gcc/target.def
> > index 2e40448e6c5..cc4aa3a4212 100644
> > --- a/gcc/target.def
> > +++ b/gcc/target.def
> > @@ -2726,6 +2726,13 @@ DEFHOOK
> >   rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, 
> > tree op0, tree op1, int bit_code),
> >   NULL)
> >
> > +DEFHOOK
> > +(gen_memset_scratch_rtx,
> > + "This hook should return an rtx for scratch register in @var{mode} to\n\
> > +be used by memset broadcast. The default is @code{gen_reg_rtx}.",
> > + rtx, (machine_mode mode),
> > + gen_reg_rtx)
> > +
> >  /* Return a new value for loop unroll size.  */
> >  DEFHOOK
> >  (loop_unroll_adjust,
> > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-3.c 
> > b/gcc/testsuite/gcc.target/i386/pr100865-3.c
> > index b6dbcf7809b..007e79f91b0 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr100865-3.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr100865-3.c
> > @@ -10,6 +10,6 @@ foo (void)
> >  }
> >
> >  /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t 
> > \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */
> > -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, 
> > \\(%\[\^,\]+\\)" 1 } } */
> > +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, 
> > \\(%\[\^,\]+\\)" 1 } } */
> >  /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, 
> > %xmm\[0-9\]+" } } */
> >  /* { dg-final { scan-assembler-not "vmovdqa" } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c 
> > b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> > index f41e6147b4c..1e50dc842bc 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> > @@ -4,6 +4,6 @@
> >  #include "pr100865-4a.c"
> >
> >  /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t 
> > \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */
> > -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, " 4 } } 
> > */
> > +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, " 4 } 
> > } */
> >  /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, 
> > %xmm\[0-9\]+" } } */
> >  /* { dg-final { scan-assembler-not "vmovdqa" } } */



-- 
H.J.

Reply via email to