On Fri, Jul 16, 2021 at 7:15 AM H.J. Lu <hjl.to...@gmail.com> wrote:
>
> On Fri, Jul 16, 2021 at 6:24 AM Richard Sandiford
> <richard.sandif...@arm.com> wrote:
> >
> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > > On Fri, Jul 16, 2021 at 4:38 AM Richard Sandiford
> > > <richard.sandif...@arm.com> wrote:
> > >>
> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > >> > 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with
> > >> > vec_duplicate_optab to duplicate QI value to TI/OI/XI value.
> > >> > 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard
> > >> > scratch register to avoid stack realignment when expanding memset.
> > >> >
> > >> >       PR middle-end/90773
> > >> >       * builtins.c (gen_memset_value_from_prev): New function.
> > >> >       (gen_memset_broadcast): Likewise.
> > >> >       (builtin_memset_read_str): Use gen_memset_value_from_prev
> > >> >       and gen_memset_broadcast.
> > >> >       (builtin_memset_gen_str): Likewise.
> > >> >       * target.def (gen_memset_scratch_rtx): New hook.
> > >> >       * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.
> > >> >       * doc/tm.texi: Regenerated.
> > >> > ---
> > >> >  gcc/builtins.c     | 123 +++++++++++++++++++++++++++++++++++++--------
> > >> >  gcc/doc/tm.texi    |   5 ++
> > >> >  gcc/doc/tm.texi.in |   2 +
> > >> >  gcc/target.def     |   7 +++
> > >> >  4 files changed, 116 insertions(+), 21 deletions(-)
> > >> >
> > >> > diff --git a/gcc/builtins.c b/gcc/builtins.c
> > >> > index 39ab139b7e1..c1758ae2efc 100644
> > >> > --- a/gcc/builtins.c
> > >> > +++ b/gcc/builtins.c
> > >> > @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target)
> > >> >    return NULL_RTX;
> > >> >  }
> > >> >
> > >> > -/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
> > >> > -   bytes from constant string DATA + OFFSET and return it as target
> > >> > -   constant.  If PREV isn't nullptr, it has the RTL info from the
> > >> > +/* Return the RTL of a register in MODE generated from PREV in the
> > >> >     previous iteration.  */
> > >> >
> > >> > -rtx
> > >> > -builtin_memset_read_str (void *data, void *prevp,
> > >> > -                      HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > >> > -                      scalar_int_mode mode)
> > >> > +static rtx
> > >> > +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode)
> > >> >  {
> > >> > +  rtx target = nullptr;
> > >> >    by_pieces_prev *prev = (by_pieces_prev *) prevp;
> > >> >    if (prev != nullptr && prev->data != nullptr)
> > >> >      {
> > >> >        /* Use the previous data in the same mode.  */
> > >> >        if (prev->mode == mode)
> > >> >       return prev->data;
> > >> > +
> > >> > +      rtx prev_rtx = prev->data;
> > >> > +      machine_mode prev_mode = prev->mode;
> > >> > +      unsigned int word_size = GET_MODE_SIZE (word_mode);
> > >> > +      if (word_size < GET_MODE_SIZE (prev->mode)
> > >> > +       && word_size > GET_MODE_SIZE (mode))
> > >> > +     {
> > >> > +       /* First generate subreg of word mode if the previous mode is
> > >> > +          wider than word mode and word mode is wider than MODE.  */
> > >> > +       prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
> > >> > +                                       prev_mode, 0);
> > >> > +       prev_mode = word_mode;
> > >> > +     }
> > >> > +      if (prev_rtx != nullptr)
> > >> > +     target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
> > >> >      }
> > >> > +  return target;
> > >> > +}
> > >> > +
> > >> > +/* Return the RTL of a register in MODE broadcasted from DATA.  */
> > >> > +
> > >> > +static rtx
> > >> > +gen_memset_broadcast (rtx data, scalar_int_mode mode)
> > >> > +{
> > >> > +  /* Skip if regno_reg_rtx isn't initialized.  */
> > >> > +  if (!regno_reg_rtx)
> > >> > +    return nullptr;
> > >> > +
> > >> > +  rtx target = nullptr;
> > >> > +
> > >> > +  unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);
> > >> > +  machine_mode vector_mode;
> > >> > +  if (!mode_for_vector (QImode, nunits).exists (&vector_mode))
> > >> > +    gcc_unreachable ();
> > >>
> > >> Sorry, I realise it's a bit late to be raising this objection now,
> > >> but I don't think it's a good idea to use scalar integer modes as
> > >> a proxy for vector modes.  In principle there's no reason why a
> > >> target has to define an integer mode for every vector mode.
> > >
> > > A target always defines the largest integer mode.
> >
> > Right.  But a target shouldn't *need* to define an integer mode
> > for every vector mode.
> >
> > >> If we want the mode to be a vector then I think the by-pieces
> > >> infrastructure should be extended to support vectors directly,
> > >> rather than assuming that each piece can be represented as
> > >> a scalar_int_mode.
> > >>
> > >
> > > The current by-pieces infrastructure operates on scalar_int_mode.
> > > Only for memset, there is
> > >
> > > /* Callback routine for store_by_pieces.  Return the RTL of a register
> > >    containing GET_MODE_SIZE (MODE) consecutive copies of the unsigned
> > >    char value given in the RTL register data.  For example, if mode is
> > >    4 bytes wide, return the RTL for 0x01010101*data.  If PREV isn't
> > >    nullptr, it has the RTL info from the previous iteration.  */
> > >
> > > static rtx
> > > builtin_memset_gen_str (void *data, void *prevp,
> > >                         HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > >                         scalar_int_mode mode)
> > >
> > > It is a broadcast.  If a target can broadcast a byte to a wider integer,
> > > can you suggest a way to use it in the current by-pieces infrastructure?
> >
> > I meant that we should extend the by-pieces infrastructure so that
> > it no longer requires scalar_int_mode, rather than work within the
> > current limits.
> >
> > IMO the fact that we're (legitimately) going through vec_duplicate_optab
>
> If vec_duplicate_optab is the only blocking issue, can we add an integer
> broadcast optab?  I want to avoid a major surgery just because we
> don't support integer broadcast from byte.
>
> > shows that we really are dealing with vectors here, not integers.
> >
>
> Are you suggesting we should add the optional vector mode
> support to by-pieces for memset?
>

Like this?

1. Replace scalar_int_mode with machine_mode in the by-pieces
infrastructure to allow non-integer mode.
2. Rename widest_int_mode_for_size to widest_int_or_QI_vector_mode_for_size
to return QI vector mode for memset.
3. Add op_by_pieces_d::smallest_mode_for_size to return the smallest
integer or QI vector mode.

Thanks.

-- 
H.J.
From 171f0a919300a8aac58dfe5fc9904eaa8fe0a05b Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.to...@gmail.com>
Date: Sun, 6 Mar 2016 06:38:21 -0800
Subject: [PATCH] Add QI vector mode support to by-pieces for memset

1. Replace scalar_int_mode with machine_mode in the by-pieces
infrastructure to allow non-integer mode.
2. Rename widest_int_mode_for_size to widest_int_or_QI_vector_mode_for_size
to return QI vector mode for memset.
3. Add op_by_pieces_d::smallest_mode_for_size to return the smallest
integer or QI vector mode.

gcc/

	PR middle-end/90773
	* builtins.c (builtin_memcpy_read_str): Change the mode argument
	from scalar_int_mode to machine_mode.
	(builtin_strncpy_read_str): Likewise.
	(gen_memset_value_from_prev): New function.
	(gen_memset_broadcast): Likewise.
	(builtin_memset_read_str): Change the mode argument from
	scalar_int_mode to machine_mode.  Use gen_memset_value_from_prev
	and gen_memset_broadcast.
	(builtin_memset_gen_str): Likewise.
	(try_store_by_multiple_pieces): Use by_pieces_constfn to declare
	constfun.
	* builtins.h (builtin_strncpy_read_str): Updated.
	(builtin_memset_read_str): Likewise.
	* expr.c (widest_int_mode_for_size): Renamed to ...
	(widest_int_or_QI_vector_mode_for_size): Add a bool argument to
	indicate if QI vector mode can be used.
	(by_pieces_ninsns): Call widest_int_or_QI_vector_mode_for_size
	instead of widest_int_mode_for_size.
	(pieces_addr::adjust): Change the mode argument from
	scalar_int_mode to machine_mode.
	(op_by_pieces_d): Add a bool member, m_qi_vector_mode, to
	indicate that QI vector mode can be used.
	(op_by_pieces_d::op_by_pieces_d): Add a bool argument to initialize
	m_qi_vector_mode.  Call widest_int_or_QI_vector_mode_for_size
	instead of widest_int_mode_for_size.
	(op_by_pieces_d::get_usable_mode): Change the mode argument from
	scalar_int_mode to machine_mode.  Call
	widest_int_or_QI_vector_mode_for_size instead of
	widest_int_mode_for_size.
	(op_by_pieces_d::smallest_mode_for_size): New member function
	to return the smallest integer or QI vector mode.
	(op_by_pieces_d::run): Call widest_int_or_QI_vector_mode_for_size
	instead of widest_int_mode_for_size.  Call smallest_mode_for_size
	instead of smallest_int_mode_for_size.
	(store_by_pieces_d::store_by_pieces_d): Add a bool argument to
	indicate that QI vector mode can be used and pass it to
	op_by_pieces_d::op_by_pieces_d.
	(can_store_by_pieces): Call widest_int_or_QI_vector_mode_for_size
	instead of widest_int_mode_for_size.
	(store_by_pieces::store_by_pieces): Pass memsetp to
	store_by_pieces_d::store_by_pieces_d.
	(clear_by_pieces_1): Change scalar_int_mode to machine_mode.
	(string_cst_read_str): Change the mode argument from
	scalar_int_mode to machine_mode.
	* expr.h (by_pieces_constfn): Change scalar_int_mode to
	machine_mode.
	(by_pieces_prev): Likewise.
	* target.def (gen_memset_scratch_rtx): New hook.
	* doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.
	* doc/tm.texi: Regenerated.

gcc/testsuite/

	* gcc.target/i386/pr100865-3.c: Expect vmovdqu8 instead of
	vmovdqu.
	* gcc.target/i386/pr100865-4b.c: Likewise.
---
 gcc/builtins.c                              | 138 ++++++++++++++-----
 gcc/builtins.h                              |   4 +-
 gcc/doc/tm.texi                             |   5 +
 gcc/doc/tm.texi.in                          |   2 +
 gcc/expr.c                                  | 145 ++++++++++++++------
 gcc/expr.h                                  |   4 +-
 gcc/target.def                              |   7 +
 gcc/testsuite/gcc.target/i386/pr100865-3.c  |   2 +-
 gcc/testsuite/gcc.target/i386/pr100865-4b.c |   2 +-
 9 files changed, 229 insertions(+), 80 deletions(-)

diff --git a/gcc/builtins.c b/gcc/builtins.c
index 39ab139b7e1..876378b467c 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -3890,13 +3890,14 @@ expand_builtin_strnlen (tree exp, rtx target, machine_mode target_mode)
 
 static rtx
 builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset,
-			 scalar_int_mode mode)
+			 machine_mode mode)
 {
   /* The REPresentation pointed to by DATA need not be a nul-terminated
      string but the caller guarantees it's large enough for MODE.  */
   const char *rep = (const char *) data;
 
-  return c_readstr (rep + offset, mode, /*nul_terminated=*/false);
+  return c_readstr (rep + offset, as_a <scalar_int_mode> (mode),
+		    /*nul_terminated=*/false);
 }
 
 /* LEN specify length of the block of memcpy/memset operation.
@@ -6478,14 +6479,14 @@ expand_builtin_stpncpy (tree exp, rtx)
 
 rtx
 builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset,
-			  scalar_int_mode mode)
+			  machine_mode mode)
 {
   const char *str = (const char *) data;
 
   if ((unsigned HOST_WIDE_INT) offset > strlen (str))
     return const0_rtx;
 
-  return c_readstr (str + offset, mode);
+  return c_readstr (str + offset, as_a <scalar_int_mode> (mode));
 }
 
 /* Helper to check the sizes of sequences and the destination of calls
@@ -6686,30 +6687,109 @@ expand_builtin_strncpy (tree exp, rtx target)
   return NULL_RTX;
 }
 
-/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
-   bytes from constant string DATA + OFFSET and return it as target
-   constant.  If PREV isn't nullptr, it has the RTL info from the
+/* Return the RTL of a register in MODE generated from PREV in the
    previous iteration.  */
 
-rtx
-builtin_memset_read_str (void *data, void *prevp,
-			 HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
-			 scalar_int_mode mode)
+static rtx
+gen_memset_value_from_prev (void *prevp, machine_mode mode)
 {
+  rtx target = nullptr;
   by_pieces_prev *prev = (by_pieces_prev *) prevp;
   if (prev != nullptr && prev->data != nullptr)
     {
       /* Use the previous data in the same mode.  */
       if (prev->mode == mode)
 	return prev->data;
+
+      rtx prev_rtx = prev->data;
+      machine_mode prev_mode = prev->mode;
+      unsigned int word_size = GET_MODE_SIZE (word_mode);
+      if (word_size < GET_MODE_SIZE (prev->mode).to_constant ()
+	  && word_size > GET_MODE_SIZE (mode).to_constant ())
+	{
+	  /* First generate subreg of word mode if the previous mode is
+	     wider than word mode and word mode is wider than MODE.  */
+	  prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
+					  prev_mode, 0);
+	  prev_mode = word_mode;
+	}
+      if (prev_rtx != nullptr)
+	target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
+    }
+  return target;
+}
+
+/* Return the RTL of a register in MODE broadcasted from DATA.  */
+
+static rtx
+gen_memset_broadcast (rtx data, machine_mode mode)
+{
+  /* Skip if regno_reg_rtx isn't initialized or not vector mode.  */
+  if (!regno_reg_rtx || !VECTOR_MODE_P (mode))
+    return nullptr;
+
+  gcc_assert (GET_MODE_INNER (mode) == QImode);
+
+  enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
+
+  gcc_assert (icode != CODE_FOR_nothing);
+
+  rtx target = targetm.gen_memset_scratch_rtx (mode);
+  if (CONST_INT_P (data))
+    {
+      /* Use the move expander with CONST_VECTOR.  */
+      rtx const_vec = gen_const_vec_duplicate (mode, data);
+      emit_move_insn (target, const_vec);
+    }
+  else
+    {
+
+      class expand_operand ops[2];
+      create_output_operand (&ops[0], target, mode);
+      create_input_operand (&ops[1], data, QImode);
+      expand_insn (icode, 2, ops);
+      if (!rtx_equal_p (target, ops[0].value))
+	emit_move_insn (target, ops[0].value);
     }
 
+  return target;
+}
+
+/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
+   bytes from constant string DATA + OFFSET and return it as target
+   constant.  If PREV isn't nullptr, it has the RTL info from the
+   previous iteration.  */
+
+rtx
+builtin_memset_read_str (void *data, void *prev,
+			 HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
+			 machine_mode mode)
+{
+  rtx target;
   const char *c = (const char *) data;
-  char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
+  char *p;
+  unsigned int size = GET_MODE_SIZE (mode).to_constant ();
+
+  /* Don't use the previous value if size is 1.  */
+  if (size != 1)
+    {
+      target = gen_memset_value_from_prev (prev, mode);
+      if (target != nullptr)
+	return target;
+
+      p = XALLOCAVEC (char, GET_MODE_SIZE (QImode));
+      memset (p, *c, GET_MODE_SIZE (QImode));
+      rtx src = c_readstr (p, QImode);
+      target = gen_memset_broadcast (src, mode);
+      if (target != nullptr)
+	return target;
+    }
+
+  p = XALLOCAVEC (char, size);
 
-  memset (p, *c, GET_MODE_SIZE (mode));
+  memset (p, *c, size);
 
-  return c_readstr (p, mode);
+  return c_readstr (p, as_a <scalar_int_mode> (mode));
 }
 
 /* Callback routine for store_by_pieces.  Return the RTL of a register
@@ -6719,33 +6799,29 @@ builtin_memset_read_str (void *data, void *prevp,
    nullptr, it has the RTL info from the previous iteration.  */
 
 static rtx
-builtin_memset_gen_str (void *data, void *prevp,
+builtin_memset_gen_str (void *data, void *prev,
 			HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
-			scalar_int_mode mode)
+			machine_mode mode)
 {
   rtx target, coeff;
   size_t size;
   char *p;
 
-  by_pieces_prev *prev = (by_pieces_prev *) prevp;
-  if (prev != nullptr && prev->data != nullptr)
-    {
-      /* Use the previous data in the same mode.  */
-      if (prev->mode == mode)
-	return prev->data;
-
-      target = simplify_gen_subreg (mode, prev->data, prev->mode, 0);
-      if (target != nullptr)
-	return target;
-    }
-
-  size = GET_MODE_SIZE (mode);
+  size = GET_MODE_SIZE (mode).to_constant ();
   if (size == 1)
     return (rtx) data;
 
+  target = gen_memset_value_from_prev (prev, mode);
+  if (target != nullptr)
+    return target;
+
+  target = gen_memset_broadcast ((rtx) data, mode);
+  if (target != nullptr)
+    return target;
+
   p = XALLOCAVEC (char, size);
   memset (p, 1, size);
-  coeff = c_readstr (p, mode);
+  coeff = c_readstr (p, as_a <scalar_int_mode> (mode));
 
   target = convert_to_mode (mode, (rtx) data, 1);
   target = expand_mult (mode, target, coeff, NULL_RTX, 1);
@@ -6849,7 +6925,7 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 			    &valc, align, true))
     return false;
 
-  rtx (*constfun) (void *, void *, HOST_WIDE_INT, scalar_int_mode);
+  by_pieces_constfn constfun;
   void *constfundata;
   if (val)
     {
diff --git a/gcc/builtins.h b/gcc/builtins.h
index a64ece3f1cd..31d07621750 100644
--- a/gcc/builtins.h
+++ b/gcc/builtins.h
@@ -111,9 +111,9 @@ extern tree mathfn_built_in (tree, enum built_in_function fn);
 extern tree mathfn_built_in (tree, combined_fn);
 extern tree mathfn_built_in_type (combined_fn);
 extern rtx builtin_strncpy_read_str (void *, void *, HOST_WIDE_INT,
-				     scalar_int_mode);
+				     machine_mode);
 extern rtx builtin_memset_read_str (void *, void *, HOST_WIDE_INT,
-				    scalar_int_mode);
+				    machine_mode);
 extern rtx expand_builtin_saveregs (void);
 extern tree std_build_builtin_va_list (void);
 extern tree std_fn_abi_va_list (tree);
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 3ad39443eba..c1995270720 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -12123,6 +12123,11 @@ This function prepares to emit a conditional comparison within a sequence
  @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares.
 @end deftypefn
 
+@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode @var{mode})
+This hook should return an rtx for scratch register in @var{mode} to
+be used by memset broadcast. The default is @code{gen_reg_rtx}.
+@end deftypefn
+
 @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop})
 This target hook returns a new value for the number of times @var{loop}
 should be unrolled. The parameter @var{nunroll} is the number of times
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index f881cdabe9e..a6bbf4f2667 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -7958,6 +7958,8 @@ lists.
 
 @hook TARGET_GEN_CCMP_NEXT
 
+@hook TARGET_GEN_MEMSET_SCRATCH_RTX
+
 @hook TARGET_LOOP_UNROLL_ADJUST
 
 @defmac POWI_MAX_MULTS
diff --git a/gcc/expr.c b/gcc/expr.c
index 6a4368113c4..92dbf47194e 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -769,15 +769,36 @@ alignment_for_piecewise_move (unsigned int max_pieces, unsigned int align)
   return align;
 }
 
-/* Return the widest integer mode that is narrower than SIZE bytes.  */
+/* Return the widest integer or QI vector mode that is narrower than
+   SIZE bytes.  */
 
-static scalar_int_mode
-widest_int_mode_for_size (unsigned int size)
+static machine_mode
+widest_int_or_QI_vector_mode_for_size (unsigned int size,
+				       bool qi_vector)
 {
-  scalar_int_mode result = NARROWEST_INT_MODE;
+  machine_mode result = NARROWEST_INT_MODE;
 
   gcc_checking_assert (size > 1);
 
+  /* Use QI vector only if size is wider than a WORD.  */
+  if (qi_vector && size > GET_MODE_SIZE (word_mode))
+    {
+      machine_mode mode;
+      FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
+	if (GET_MODE_INNER (mode) == QImode
+	    && GET_MODE_SIZE (mode).is_constant ())
+	  {
+	    if (GET_MODE_SIZE (mode).to_constant () >= size)
+	      break;
+	    if (optab_handler (vec_duplicate_optab, mode)
+		!= CODE_FOR_nothing)
+	      result = mode;
+	  }
+
+      if (result != NARROWEST_INT_MODE)
+	return result;
+    }
+
   opt_scalar_int_mode tmode;
   FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT)
     if (GET_MODE_SIZE (tmode.require ()) < size)
@@ -815,16 +836,18 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align,
 		  unsigned int max_size, by_pieces_operation op)
 {
   unsigned HOST_WIDE_INT n_insns = 0;
-  scalar_int_mode mode;
+  machine_mode mode;
 
   if (targetm.overlap_op_by_pieces_p () && op != COMPARE_BY_PIECES)
     {
       /* NB: Round up L and ALIGN to the widest integer mode for
 	 MAX_SIZE.  */
-      mode = widest_int_mode_for_size (max_size);
+      mode = widest_int_or_QI_vector_mode_for_size (max_size,
+						    op == SET_BY_PIECES);
       if (optab_handler (mov_optab, mode) != CODE_FOR_nothing)
 	{
-	  unsigned HOST_WIDE_INT up = ROUND_UP (l, GET_MODE_SIZE (mode));
+	  unsigned HOST_WIDE_INT up = ROUND_UP (l,
+						GET_MODE_SIZE (mode).to_constant ());
 	  if (up > l)
 	    l = up;
 	  align = GET_MODE_ALIGNMENT (mode);
@@ -835,10 +858,11 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align,
 
   while (max_size > 1 && l > 0)
     {
-      mode = widest_int_mode_for_size (max_size);
+      mode = widest_int_or_QI_vector_mode_for_size (max_size,
+						    op == SET_BY_PIECES);
       enum insn_code icode;
 
-      unsigned int modesize = GET_MODE_SIZE (mode);
+      unsigned int modesize = GET_MODE_SIZE (mode).to_constant ();
 
       icode = optab_handler (mov_optab, mode);
       if (icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode))
@@ -903,8 +927,7 @@ class pieces_addr
   void *m_cfndata;
 public:
   pieces_addr (rtx, bool, by_pieces_constfn, void *);
-  rtx adjust (scalar_int_mode, HOST_WIDE_INT,
-	      by_pieces_prev * = nullptr);
+  rtx adjust (machine_mode, HOST_WIDE_INT, by_pieces_prev * = nullptr);
   void increment_address (HOST_WIDE_INT);
   void maybe_predec (HOST_WIDE_INT);
   void maybe_postinc (HOST_WIDE_INT);
@@ -1006,7 +1029,7 @@ pieces_addr::decide_autoinc (machine_mode ARG_UNUSED (mode), bool reverse,
    but we still modify the MEM's properties.  */
 
 rtx
-pieces_addr::adjust (scalar_int_mode mode, HOST_WIDE_INT offset,
+pieces_addr::adjust (machine_mode mode, HOST_WIDE_INT offset,
 		     by_pieces_prev *prev)
 {
   if (m_constfn)
@@ -1060,7 +1083,8 @@ pieces_addr::maybe_postinc (HOST_WIDE_INT size)
 class op_by_pieces_d
 {
  private:
-  scalar_int_mode get_usable_mode (scalar_int_mode mode, unsigned int);
+  machine_mode get_usable_mode (machine_mode, unsigned int);
+  machine_mode smallest_mode_for_size (unsigned int);
 
  protected:
   pieces_addr m_to, m_from;
@@ -1073,6 +1097,8 @@ class op_by_pieces_d
   bool m_push;
   /* True if targetm.overlap_op_by_pieces_p () returns true.  */
   bool m_overlap_op_by_pieces;
+  /* True if QI vector mode can be used.  */
+  bool m_qi_vector_mode;
 
   /* Virtual functions, overriden by derived classes for the specific
      operation.  */
@@ -1084,7 +1110,8 @@ class op_by_pieces_d
 
  public:
   op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *,
-		  unsigned HOST_WIDE_INT, unsigned int, bool);
+		  unsigned HOST_WIDE_INT, unsigned int, bool,
+		  bool = false);
   void run ();
 };
 
@@ -1099,11 +1126,12 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
 				by_pieces_constfn from_cfn,
 				void *from_cfn_data,
 				unsigned HOST_WIDE_INT len,
-				unsigned int align, bool push)
+				unsigned int align, bool push,
+				bool qi_vector_mode)
   : m_to (to, to_load, NULL, NULL),
     m_from (from, from_load, from_cfn, from_cfn_data),
     m_len (len), m_max_size (MOVE_MAX_PIECES + 1),
-    m_push (push)
+    m_push (push), m_qi_vector_mode (qi_vector_mode)
 {
   int toi = m_to.get_addr_inc ();
   int fromi = m_from.get_addr_inc ();
@@ -1124,7 +1152,9 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
   if (by_pieces_ninsns (len, align, m_max_size, MOVE_BY_PIECES) > 2)
     {
       /* Find the mode of the largest comparison.  */
-      scalar_int_mode mode = widest_int_mode_for_size (m_max_size);
+      machine_mode mode
+	= widest_int_or_QI_vector_mode_for_size (m_max_size,
+						 m_qi_vector_mode);
 
       m_from.decide_autoinc (mode, m_reverse, len);
       m_to.decide_autoinc (mode, m_reverse, len);
@@ -1139,22 +1169,42 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
 /* This function returns the largest usable integer mode for LEN bytes
    whose size is no bigger than size of MODE.  */
 
-scalar_int_mode
-op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len)
+machine_mode
+op_by_pieces_d::get_usable_mode (machine_mode mode, unsigned int len)
 {
   unsigned int size;
   do
     {
-      size = GET_MODE_SIZE (mode);
+      size = GET_MODE_SIZE (mode).to_constant ();
       if (len >= size && prepare_mode (mode, m_align))
 	break;
-      /* NB: widest_int_mode_for_size checks SIZE > 1.  */
-      mode = widest_int_mode_for_size (size);
+      /* NB: widest_int_or_QI_vector_mode_for_size checks SIZE > 1.  */
+      mode = widest_int_or_QI_vector_mode_for_size (size,
+						    m_qi_vector_mode);
     }
   while (1);
   return mode;
 }
 
+/* Return the smallest integer or QI vector mode that is not narrower
+   than SIZE bits.  */
+
+machine_mode
+op_by_pieces_d::smallest_mode_for_size (unsigned int size)
+{
+  /* Use QI vector only for > size of WORD.  */
+  if (m_qi_vector_mode && size > GET_MODE_PRECISION (word_mode))
+    {
+      machine_mode mode;
+      FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
+	if (GET_MODE_INNER (mode) == QImode
+	    && known_ge (GET_MODE_PRECISION (mode), size))
+	  return mode;
+    }
+
+  return smallest_int_mode_for_size (size);
+}
+
 /* This function contains the main loop used for expanding a block
    operation.  First move what we can in the largest integer mode,
    then go to successively smaller modes.  For every access, call
@@ -1166,8 +1216,10 @@ op_by_pieces_d::run ()
   if (m_len == 0)
     return;
 
-  /* NB: widest_int_mode_for_size checks M_MAX_SIZE > 1.  */
-  scalar_int_mode mode = widest_int_mode_for_size (m_max_size);
+  /* NB: widest_int_or_QI_vector_mode_for_size checks M_MAX_SIZE > 1.  */
+  machine_mode mode
+    = widest_int_or_QI_vector_mode_for_size (m_max_size,
+					     m_qi_vector_mode);
   mode = get_usable_mode (mode, m_len);
 
   by_pieces_prev to_prev = { nullptr, mode };
@@ -1175,7 +1227,7 @@ op_by_pieces_d::run ()
 
   do
     {
-      unsigned int size = GET_MODE_SIZE (mode);
+      unsigned int size = GET_MODE_SIZE (mode).to_constant ();
       rtx to1 = NULL_RTX, from1;
 
       while (m_len >= size)
@@ -1214,9 +1266,10 @@ op_by_pieces_d::run ()
 	  /* NB: Generate overlapping operations if it is not a stack
 	     push since stack push must not overlap.  Get the smallest
 	     integer mode for M_LEN bytes.  */
-	  mode = smallest_int_mode_for_size (m_len * BITS_PER_UNIT);
-	  mode = get_usable_mode (mode, GET_MODE_SIZE (mode));
-	  int gap = GET_MODE_SIZE (mode) - m_len;
+	  mode = smallest_mode_for_size (m_len * BITS_PER_UNIT);
+	  unsigned int mode_size = GET_MODE_SIZE (mode).to_constant ();
+	  mode = get_usable_mode (mode, mode_size);
+	  int gap = mode_size - m_len;
 	  if (gap > 0)
 	    {
 	      /* If size of MODE > M_LEN, generate the last operation
@@ -1231,8 +1284,9 @@ op_by_pieces_d::run ()
 	}
       else
 	{
-	  /* NB: widest_int_mode_for_size checks SIZE > 1.  */
-	  mode = widest_int_mode_for_size (size);
+	  /* NB: widest_int_or_QI_vector_mode_for_size checks SIZE > 1.  */
+	  mode = widest_int_or_QI_vector_mode_for_size (size,
+							m_qi_vector_mode);
 	  mode = get_usable_mode (mode, m_len);
 	}
     }
@@ -1355,9 +1409,10 @@ class store_by_pieces_d : public op_by_pieces_d
 
  public:
   store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data,
-		     unsigned HOST_WIDE_INT len, unsigned int align)
+		     unsigned HOST_WIDE_INT len, unsigned int align,
+		     bool qi_vector_mode = false)
     : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len,
-		      align, false)
+		      align, false, qi_vector_mode)
   {
   }
   rtx finish_retmode (memop_ret);
@@ -1446,13 +1501,14 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len,
       max_size = STORE_MAX_PIECES + 1;
       while (max_size > 1 && l > 0)
 	{
-	  scalar_int_mode mode = widest_int_mode_for_size (max_size);
+	  machine_mode mode
+	    = widest_int_or_QI_vector_mode_for_size (max_size, memsetp);
 
 	  icode = optab_handler (mov_optab, mode);
 	  if (icode != CODE_FOR_nothing
 	      && align >= GET_MODE_ALIGNMENT (mode))
 	    {
-	      unsigned int size = GET_MODE_SIZE (mode);
+	      unsigned int size = GET_MODE_SIZE (mode).to_constant ();
 
 	      while (l >= size)
 		{
@@ -1470,7 +1526,7 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len,
 		}
 	    }
 
-	  max_size = GET_MODE_SIZE (mode);
+	  max_size = GET_MODE_SIZE (mode).to_constant ();
 	}
 
       /* The code above should have handled everything.  */
@@ -1504,7 +1560,8 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,
 		 memsetp ? SET_BY_PIECES : STORE_BY_PIECES,
 		 optimize_insn_for_speed_p ()));
 
-  store_by_pieces_d data (to, constfun, constfundata, len, align);
+  store_by_pieces_d data (to, constfun, constfundata, len, align,
+			  memsetp);
   data.run ();
 
   if (retmode != RETURN_BEGIN)
@@ -1517,7 +1574,7 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,
    Return const0_rtx unconditionally.  */
 
 static rtx
-clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, scalar_int_mode)
+clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, machine_mode)
 {
   return const0_rtx;
 }
@@ -5754,7 +5811,7 @@ emit_storent_insn (rtx to, rtx from)
 
 static rtx
 string_cst_read_str (void *data, void *, HOST_WIDE_INT offset,
-		     scalar_int_mode mode)
+		     machine_mode mode)
 {
   tree str = (tree) data;
 
@@ -5762,17 +5819,19 @@ string_cst_read_str (void *data, void *, HOST_WIDE_INT offset,
   if (offset >= TREE_STRING_LENGTH (str))
     return const0_rtx;
 
-  if ((unsigned HOST_WIDE_INT) offset + GET_MODE_SIZE (mode)
+  unsigned int size = GET_MODE_SIZE (mode).to_constant ();
+  if ((unsigned HOST_WIDE_INT) offset + size
       > (unsigned HOST_WIDE_INT) TREE_STRING_LENGTH (str))
     {
-      char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
+      char *p = XALLOCAVEC (char, size);
       size_t l = TREE_STRING_LENGTH (str) - offset;
       memcpy (p, TREE_STRING_POINTER (str) + offset, l);
-      memset (p + l, '\0', GET_MODE_SIZE (mode) - l);
-      return c_readstr (p, mode, false);
+      memset (p + l, '\0', size - l);
+      return c_readstr (p, as_a <scalar_int_mode> (mode), false);
     }
 
-  return c_readstr (TREE_STRING_POINTER (str) + offset, mode, false);
+  return c_readstr (TREE_STRING_POINTER (str) + offset,
+		    as_a <scalar_int_mode> (mode), false);
 }
 
 /* Generate code for computing expression EXP,
diff --git a/gcc/expr.h b/gcc/expr.h
index a4f44265759..552645e4403 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -108,13 +108,13 @@ enum block_op_methods
 };
 
 typedef rtx (*by_pieces_constfn) (void *, void *, HOST_WIDE_INT,
-				  scalar_int_mode);
+				  machine_mode);
 
 /* The second pointer passed to by_pieces_constfn.  */
 struct by_pieces_prev
 {
   rtx data;
-  scalar_int_mode mode;
+  machine_mode mode;
 };
 
 extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
diff --git a/gcc/target.def b/gcc/target.def
index 2e40448e6c5..cc4aa3a4212 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -2726,6 +2726,13 @@ DEFHOOK
  rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree op0, tree op1, int bit_code),
  NULL)
 
+DEFHOOK
+(gen_memset_scratch_rtx,
+ "This hook should return an rtx for scratch register in @var{mode} to\n\
+be used by memset broadcast. The default is @code{gen_reg_rtx}.",
+ rtx, (machine_mode mode),
+ gen_reg_rtx)
+
 /* Return a new value for loop unroll size.  */
 DEFHOOK
 (loop_unroll_adjust,
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-3.c b/gcc/testsuite/gcc.target/i386/pr100865-3.c
index b6dbcf7809b..007e79f91b0 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-3.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-3.c
@@ -10,6 +10,6 @@ foo (void)
 }
 
 /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
 /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */
 /* { dg-final { scan-assembler-not "vmovdqa" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
index f41e6147b4c..1e50dc842bc 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
@@ -4,6 +4,6 @@
 #include "pr100865-4a.c"
 
 /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, " 4 } } */
+/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, " 4 } } */
 /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */
 /* { dg-final { scan-assembler-not "vmovdqa" } } */
-- 
2.31.1

Reply via email to