"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> On Wed, Jul 21, 2021 at 7:50 AM Richard Sandiford
> <richard.sandif...@arm.com> wrote:
>>
>> "H.J. Lu" <hjl.to...@gmail.com> writes:
>> > diff --git a/gcc/builtins.c b/gcc/builtins.c
>> > index 39ab139b7e1..1972301ce3c 100644
>> > --- a/gcc/builtins.c
>> > +++ b/gcc/builtins.c
>> > @@ -3890,13 +3890,16 @@ expand_builtin_strnlen (tree exp, rtx target, 
>> > machine_mode target_mode)
>> >
>> >  static rtx
>> >  builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset,
>> > -                      scalar_int_mode mode)
>> > +                      fixed_size_mode mode)
>> >  {
>> >    /* The REPresentation pointed to by DATA need not be a nul-terminated
>> >       string but the caller guarantees it's large enough for MODE.  */
>> >    const char *rep = (const char *) data;
>> >
>> > -  return c_readstr (rep + offset, mode, /*nul_terminated=*/false);
>> > +  /* NB: Vector mode in the by-pieces infrastructure is only used by
>> > +     the memset expander.  */
>>
>> Sorry to nitpick, but I guess this might get out out-of-date.  Maybe:
>>
>>   /* The by-pieces infrastructure does not try to pick a vector mode
>>      for memcpy expansion.  */
>
> Fixed.
>
>> > +  return c_readstr (rep + offset, as_a <scalar_int_mode> (mode),
>> > +                 /*nul_terminated=*/false);
>> >  }
>> >
>> >  /* LEN specify length of the block of memcpy/memset operation.
>> > @@ -6478,14 +6481,16 @@ expand_builtin_stpncpy (tree exp, rtx)
>> >
>> >  rtx
>> >  builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset,
>> > -                       scalar_int_mode mode)
>> > +                       fixed_size_mode mode)
>> >  {
>> >    const char *str = (const char *) data;
>> >
>> >    if ((unsigned HOST_WIDE_INT) offset > strlen (str))
>> >      return const0_rtx;
>> >
>> > -  return c_readstr (str + offset, mode);
>> > +  /* NB: Vector mode in the by-pieces infrastructure is only used by
>> > +     the memset expander.  */
>>
>> Similarly here for strncpy expansion.
>
> Fixed.
>
>> > +  return c_readstr (str + offset, as_a <scalar_int_mode> (mode));
>> >  }
>> >
>> >  /* Helper to check the sizes of sequences and the destination of calls
>> > @@ -6686,30 +6691,117 @@ expand_builtin_strncpy (tree exp, rtx target)
>> >    return NULL_RTX;
>> >  }
>> >
>> > -/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
>> > -   bytes from constant string DATA + OFFSET and return it as target
>> > -   constant.  If PREV isn't nullptr, it has the RTL info from the
>> > +/* Return the RTL of a register in MODE generated from PREV in the
>> >     previous iteration.  */
>> >
>> > -rtx
>> > -builtin_memset_read_str (void *data, void *prevp,
>> > -                      HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
>> > -                      scalar_int_mode mode)
>> > +static rtx
>> > +gen_memset_value_from_prev (by_pieces_prev *prev, fixed_size_mode mode)
>> >  {
>> > -  by_pieces_prev *prev = (by_pieces_prev *) prevp;
>> > +  rtx target = nullptr;
>> >    if (prev != nullptr && prev->data != nullptr)
>> >      {
>> >        /* Use the previous data in the same mode.  */
>> >        if (prev->mode == mode)
>> >       return prev->data;
>> > +
>> > +      fixed_size_mode prev_mode = prev->mode;
>> > +
>> > +      /* Don't use the previous data to write QImode if it is in a
>> > +      vector mode.  */
>> > +      if (VECTOR_MODE_P (prev_mode) && mode == QImode)
>> > +     return target;
>> > +
>> > +      rtx prev_rtx = prev->data;
>> > +
>> > +      if (REG_P (prev_rtx)
>> > +       && HARD_REGISTER_P (prev_rtx)
>> > +       && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0)
>> > +     {
>> > +       /* If we can't put a hard register in MODE, first generate a
>> > +          subreg of word mode if the previous mode is wider than word
>> > +          mode and word mode is wider than MODE.  */
>> > +       if (UNITS_PER_WORD < GET_MODE_SIZE (prev_mode)
>> > +           && UNITS_PER_WORD > GET_MODE_SIZE (mode))
>> > +         {
>> > +           prev_rtx = lowpart_subreg (word_mode, prev_rtx,
>> > +                                      prev_mode);
>> > +           if (prev_rtx != nullptr)
>> > +             prev_mode = word_mode;
>> > +         }
>> > +       else
>> > +         prev_rtx = nullptr;
>>
>> I don't understand this.  Why not just do the:
>>
>>       if (REG_P (prev_rtx)
>>           && HARD_REGISTER_P (prev_rtx)
>>           && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0)
>>         prev_rtx = copy_to_reg (prev_rtx);
>>
>> that I suggested in the previous review?
>
> But for
> ---
> extern void *ops;
>
> void
> foo (int c)
> {
>   __builtin_memset (ops, c, 18);
> }
> ---
> I got
>
> vpbroadcastb %edi, %xmm31
> vmovdqa64 %xmm31, -24(%rsp)
> movq ops(%rip), %rax
> movzwl -24(%rsp), %edx
> vmovdqu8 %xmm31, (%rax)
> movw %dx, 16(%rax)
> ret
>
> I want to avoid store and load.  I am testing
>
>       if (REG_P (prev_rtx)
>           && HARD_REGISTER_P (prev_rtx)
>           && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0)
>         {
>           /* Find the smallest subreg mode in the same mode class which
>              is not narrower than MODE and narrower than PREV_MODE.  */
>           machine_mode m;
>           fixed_size_mode candidate;
>           FOR_EACH_MODE_IN_CLASS (m, GET_MODE_CLASS (mode))
>             if (is_a<fixed_size_mode> (m, &candidate))
>               {
>                 if (GET_MODE_SIZE (candidate)
>                     >= GET_MODE_SIZE (prev_mode))
>                   break;
>                 if (GET_MODE_SIZE (candidate) >= GET_MODE_SIZE (mode)
>                     && lowpart_subreg_regno (REGNO (prev_rtx),
>                                              prev_mode, candidate) >= 0)
>                   {
>                     target = lowpart_subreg (candidate, prev_rtx,
>                                              prev_mode);
>                     prev_rtx = target;
>                     prev_mode = candidate;
>                     break;
>                   }
>               }

That doesn't seem better though.  There are still two subregs involved.

>           if (target == nullptr)
>             prev_rtx = copy_to_reg (prev_rtx);
>         }
>
> to get
>
> movq ops(%rip), %rax
> vpbroadcastb %edi, %xmm31
> vmovd %xmm31, %edx
> movw %dx, 16(%rax)
> vmovdqu8 %xmm31, (%rax)
> ret

What does the pre-RA RTL look like for the code that you want?

Thanks,
Richard

Reply via email to