On Wed, Jul 21, 2021 at 12:20 PM Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > On Wed, Jul 21, 2021 at 7:50 AM Richard Sandiford
> > <richard.sandif...@arm.com> wrote:
> >>
> >> "H.J. Lu" <hjl.to...@gmail.com> writes:
> >> > diff --git a/gcc/builtins.c b/gcc/builtins.c
> >> > index 39ab139b7e1..1972301ce3c 100644
> >> > --- a/gcc/builtins.c
> >> > +++ b/gcc/builtins.c
> >> > @@ -3890,13 +3890,16 @@ expand_builtin_strnlen (tree exp, rtx target, 
> >> > machine_mode target_mode)
> >> >
> >> >  static rtx
> >> >  builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset,
> >> > -                      scalar_int_mode mode)
> >> > +                      fixed_size_mode mode)
> >> >  {
> >> >    /* The REPresentation pointed to by DATA need not be a nul-terminated
> >> >       string but the caller guarantees it's large enough for MODE.  */
> >> >    const char *rep = (const char *) data;
> >> >
> >> > -  return c_readstr (rep + offset, mode, /*nul_terminated=*/false);
> >> > +  /* NB: Vector mode in the by-pieces infrastructure is only used by
> >> > +     the memset expander.  */
> >>
> >> Sorry to nitpick, but I guess this might get out out-of-date.  Maybe:
> >>
> >>   /* The by-pieces infrastructure does not try to pick a vector mode
> >>      for memcpy expansion.  */
> >
> > Fixed.
> >
> >> > +  return c_readstr (rep + offset, as_a <scalar_int_mode> (mode),
> >> > +                 /*nul_terminated=*/false);
> >> >  }
> >> >
> >> >  /* LEN specify length of the block of memcpy/memset operation.
> >> > @@ -6478,14 +6481,16 @@ expand_builtin_stpncpy (tree exp, rtx)
> >> >
> >> >  rtx
> >> >  builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset,
> >> > -                       scalar_int_mode mode)
> >> > +                       fixed_size_mode mode)
> >> >  {
> >> >    const char *str = (const char *) data;
> >> >
> >> >    if ((unsigned HOST_WIDE_INT) offset > strlen (str))
> >> >      return const0_rtx;
> >> >
> >> > -  return c_readstr (str + offset, mode);
> >> > +  /* NB: Vector mode in the by-pieces infrastructure is only used by
> >> > +     the memset expander.  */
> >>
> >> Similarly here for strncpy expansion.
> >
> > Fixed.
> >
> >> > +  return c_readstr (str + offset, as_a <scalar_int_mode> (mode));
> >> >  }
> >> >
> >> >  /* Helper to check the sizes of sequences and the destination of calls
> >> > @@ -6686,30 +6691,117 @@ expand_builtin_strncpy (tree exp, rtx target)
> >> >    return NULL_RTX;
> >> >  }
> >> >
> >> > -/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
> >> > -   bytes from constant string DATA + OFFSET and return it as target
> >> > -   constant.  If PREV isn't nullptr, it has the RTL info from the
> >> > +/* Return the RTL of a register in MODE generated from PREV in the
> >> >     previous iteration.  */
> >> >
> >> > -rtx
> >> > -builtin_memset_read_str (void *data, void *prevp,
> >> > -                      HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> >> > -                      scalar_int_mode mode)
> >> > +static rtx
> >> > +gen_memset_value_from_prev (by_pieces_prev *prev, fixed_size_mode mode)
> >> >  {
> >> > -  by_pieces_prev *prev = (by_pieces_prev *) prevp;
> >> > +  rtx target = nullptr;
> >> >    if (prev != nullptr && prev->data != nullptr)
> >> >      {
> >> >        /* Use the previous data in the same mode.  */
> >> >        if (prev->mode == mode)
> >> >       return prev->data;
> >> > +
> >> > +      fixed_size_mode prev_mode = prev->mode;
> >> > +
> >> > +      /* Don't use the previous data to write QImode if it is in a
> >> > +      vector mode.  */
> >> > +      if (VECTOR_MODE_P (prev_mode) && mode == QImode)
> >> > +     return target;
> >> > +
> >> > +      rtx prev_rtx = prev->data;
> >> > +
> >> > +      if (REG_P (prev_rtx)
> >> > +       && HARD_REGISTER_P (prev_rtx)
> >> > +       && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0)
> >> > +     {
> >> > +       /* If we can't put a hard register in MODE, first generate a
> >> > +          subreg of word mode if the previous mode is wider than word
> >> > +          mode and word mode is wider than MODE.  */
> >> > +       if (UNITS_PER_WORD < GET_MODE_SIZE (prev_mode)
> >> > +           && UNITS_PER_WORD > GET_MODE_SIZE (mode))
> >> > +         {
> >> > +           prev_rtx = lowpart_subreg (word_mode, prev_rtx,
> >> > +                                      prev_mode);
> >> > +           if (prev_rtx != nullptr)
> >> > +             prev_mode = word_mode;
> >> > +         }
> >> > +       else
> >> > +         prev_rtx = nullptr;
> >>
> >> I don't understand this.  Why not just do the:
> >>
> >>       if (REG_P (prev_rtx)
> >>           && HARD_REGISTER_P (prev_rtx)
> >>           && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0)
> >>         prev_rtx = copy_to_reg (prev_rtx);
> >>
> >> that I suggested in the previous review?
> >
> > But for
> > ---
> > extern void *ops;
> >
> > void
> > foo (int c)
> > {
> >   __builtin_memset (ops, c, 18);
> > }
> > ---
> > I got
> >
> > vpbroadcastb %edi, %xmm31
> > vmovdqa64 %xmm31, -24(%rsp)
> > movq ops(%rip), %rax
> > movzwl -24(%rsp), %edx
> > vmovdqu8 %xmm31, (%rax)
> > movw %dx, 16(%rax)
> > ret
> >
> > I want to avoid store and load.  I am testing
> >
> >       if (REG_P (prev_rtx)
> >           && HARD_REGISTER_P (prev_rtx)
> >           && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0)
> >         {
> >           /* Find the smallest subreg mode in the same mode class which
> >              is not narrower than MODE and narrower than PREV_MODE.  */
> >           machine_mode m;
> >           fixed_size_mode candidate;
> >           FOR_EACH_MODE_IN_CLASS (m, GET_MODE_CLASS (mode))
> >             if (is_a<fixed_size_mode> (m, &candidate))
> >               {
> >                 if (GET_MODE_SIZE (candidate)
> >                     >= GET_MODE_SIZE (prev_mode))
> >                   break;
> >                 if (GET_MODE_SIZE (candidate) >= GET_MODE_SIZE (mode)
> >                     && lowpart_subreg_regno (REGNO (prev_rtx),
> >                                              prev_mode, candidate) >= 0)
> >                   {
> >                     target = lowpart_subreg (candidate, prev_rtx,
> >                                              prev_mode);
> >                     prev_rtx = target;
> >                     prev_mode = candidate;
> >                     break;
> >                   }
> >               }
>
> That doesn't seem better though.  There are still two subregs involved.

Since a hard register is used, there is only one subreg generated:

(insn 7 6 8 2 (set (reg:QI 85)
        (subreg:QI (reg/v:SI 83 [ c ]) 0)) "s2a.i":6:3 77 {*movqi_internal}
     (nil))
(insn 8 7 9 2 (set (reg:V16QI 67 xmm31)
        (vec_duplicate:V16QI (reg:QI 85))) "s2a.i":6:3 5165 {*avx512vl_vec_dup_g
prv16qi}
     (nil))
(insn 9 8 10 2 (set (mem:V16QI (reg/f:DI 84) [0 MEM <char[1:18]> [(void *)ops.0_
1]+0 S16 A8])
        (reg:V16QI 67 xmm31)) "s2a.i":6:3 1574 {movv16qi_internal}
     (nil))
(insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)
                (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void *)ops.0_1]+16
S2 A8])
        (subreg:HI (reg:SI 67 xmm31) 0)) "s2a.i":6:3 76 {*movhi_internal}
     (nil))

> >           if (target == nullptr)
> >             prev_rtx = copy_to_reg (prev_rtx);
> >         }
> >
> > to get
> >
> > movq ops(%rip), %rax
> > vpbroadcastb %edi, %xmm31
> > vmovd %xmm31, %edx
> > movw %dx, 16(%rax)
> > vmovdqu8 %xmm31, (%rax)
> > ret
>
> What does the pre-RA RTL look like for the code that you want?

WIth my code,  I got

(insn 8 6 9 2 (set (reg:V16QI 67 xmm31)
        (vec_duplicate:V16QI (subreg:QI (reg:SI 86) 0))) "s2a.i":6:3
5165 {*avx512vl_vec_dup_gprv16qi}
     (expr_list:REG_DEAD (reg:SI 86)
        (nil)))
(insn 9 8 10 2 (set (mem:V16QI (reg/f:DI 84 [ ops ]) [0 MEM
<char[1:18]> [(void *)ops.0_1]+0 S16 A8])
        (reg:V16QI 67 xmm31)) "s2a.i":6:3 1574 {movv16qi_internal}
     (nil))
(insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84 [ ops ])
                (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void
*)ops.0_1]+16 S2 A8])
        (subreg:HI (reg:SI 67 xmm31) 0)) "s2a.i":6:3 76 {*movhi_internal}
     (expr_list:REG_DEAD (reg/f:DI 84 [ ops ])
        (expr_list:REG_DEAD (reg:SI 67 xmm31)
            (nil))))

With copy_to_reg, I got

(insn 8 6 9 2 (set (reg:V16QI 67 xmm31)
        (vec_duplicate:V16QI (subreg:QI (reg:SI 87) 0))) "s2a.i":6:3
5165 {*avx512vl_vec_dup_gprv16qi}
     (expr_list:REG_DEAD (reg:SI 87)
        (nil)))
(insn 9 8 15 2 (set (mem:V16QI (reg/f:DI 84 [ ops ]) [0 MEM
<char[1:18]> [(void *)ops.0_1]+0 S16 A8])
        (reg:V16QI 67 xmm31)) "s2a.i":6:3 1574 {movv16qi_internal}
     (nil))
(insn 15 9 10 2 (set (reg:V16QI 88)
        (reg:V16QI 67 xmm31)) "s2a.i":6:3 1574 {movv16qi_internal}
     (expr_list:REG_DEAD (reg:V16QI 67 xmm31)
        (nil)))
(note 10 15 11 2 NOTE_INSN_DELETED)
(insn 11 10 0 2 (set (mem:HI (plus:DI (reg/f:DI 84 [ ops ])
                (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void
*)ops.0_1]+16 S2 A8])
        (subreg:HI (reg:V16QI 88) 0)) "s2a.i":6:3 76 {*movhi_internal}
     (expr_list:REG_DEAD (reg:V16QI 88)
        (expr_list:REG_DEAD (reg/f:DI 84 [ ops ])
            (nil))))


> Thanks,
> Richard



-- 
H.J.

Reply via email to