"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > On Wed, Jul 21, 2021 at 7:50 AM Richard Sandiford > <richard.sandif...@arm.com> wrote: >> >> "H.J. Lu" <hjl.to...@gmail.com> writes: >> > diff --git a/gcc/builtins.c b/gcc/builtins.c >> > index 39ab139b7e1..1972301ce3c 100644 >> > --- a/gcc/builtins.c >> > +++ b/gcc/builtins.c >> > @@ -3890,13 +3890,16 @@ expand_builtin_strnlen (tree exp, rtx target, >> > machine_mode target_mode) >> > >> > static rtx >> > builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset, >> > - scalar_int_mode mode) >> > + fixed_size_mode mode) >> > { >> > /* The REPresentation pointed to by DATA need not be a nul-terminated >> > string but the caller guarantees it's large enough for MODE. */ >> > const char *rep = (const char *) data; >> > >> > - return c_readstr (rep + offset, mode, /*nul_terminated=*/false); >> > + /* NB: Vector mode in the by-pieces infrastructure is only used by >> > + the memset expander. */ >> >> Sorry to nitpick, but I guess this might get out out-of-date. Maybe: >> >> /* The by-pieces infrastructure does not try to pick a vector mode >> for memcpy expansion. */ > > Fixed. > >> > + return c_readstr (rep + offset, as_a <scalar_int_mode> (mode), >> > + /*nul_terminated=*/false); >> > } >> > >> > /* LEN specify length of the block of memcpy/memset operation. >> > @@ -6478,14 +6481,16 @@ expand_builtin_stpncpy (tree exp, rtx) >> > >> > rtx >> > builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset, >> > - scalar_int_mode mode) >> > + fixed_size_mode mode) >> > { >> > const char *str = (const char *) data; >> > >> > if ((unsigned HOST_WIDE_INT) offset > strlen (str)) >> > return const0_rtx; >> > >> > - return c_readstr (str + offset, mode); >> > + /* NB: Vector mode in the by-pieces infrastructure is only used by >> > + the memset expander. */ >> >> Similarly here for strncpy expansion. > > Fixed. > >> > + return c_readstr (str + offset, as_a <scalar_int_mode> (mode)); >> > } >> > >> > /* Helper to check the sizes of sequences and the destination of calls >> > @@ -6686,30 +6691,117 @@ expand_builtin_strncpy (tree exp, rtx target) >> > return NULL_RTX; >> > } >> > >> > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) >> > - bytes from constant string DATA + OFFSET and return it as target >> > - constant. If PREV isn't nullptr, it has the RTL info from the >> > +/* Return the RTL of a register in MODE generated from PREV in the >> > previous iteration. */ >> > >> > -rtx >> > -builtin_memset_read_str (void *data, void *prevp, >> > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED, >> > - scalar_int_mode mode) >> > +static rtx >> > +gen_memset_value_from_prev (by_pieces_prev *prev, fixed_size_mode mode) >> > { >> > - by_pieces_prev *prev = (by_pieces_prev *) prevp; >> > + rtx target = nullptr; >> > if (prev != nullptr && prev->data != nullptr) >> > { >> > /* Use the previous data in the same mode. */ >> > if (prev->mode == mode) >> > return prev->data; >> > + >> > + fixed_size_mode prev_mode = prev->mode; >> > + >> > + /* Don't use the previous data to write QImode if it is in a >> > + vector mode. */ >> > + if (VECTOR_MODE_P (prev_mode) && mode == QImode) >> > + return target; >> > + >> > + rtx prev_rtx = prev->data; >> > + >> > + if (REG_P (prev_rtx) >> > + && HARD_REGISTER_P (prev_rtx) >> > + && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) >> > + { >> > + /* If we can't put a hard register in MODE, first generate a >> > + subreg of word mode if the previous mode is wider than word >> > + mode and word mode is wider than MODE. */ >> > + if (UNITS_PER_WORD < GET_MODE_SIZE (prev_mode) >> > + && UNITS_PER_WORD > GET_MODE_SIZE (mode)) >> > + { >> > + prev_rtx = lowpart_subreg (word_mode, prev_rtx, >> > + prev_mode); >> > + if (prev_rtx != nullptr) >> > + prev_mode = word_mode; >> > + } >> > + else >> > + prev_rtx = nullptr; >> >> I don't understand this. Why not just do the: >> >> if (REG_P (prev_rtx) >> && HARD_REGISTER_P (prev_rtx) >> && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) >> prev_rtx = copy_to_reg (prev_rtx); >> >> that I suggested in the previous review? > > But for > --- > extern void *ops; > > void > foo (int c) > { > __builtin_memset (ops, c, 18); > } > --- > I got > > vpbroadcastb %edi, %xmm31 > vmovdqa64 %xmm31, -24(%rsp) > movq ops(%rip), %rax > movzwl -24(%rsp), %edx > vmovdqu8 %xmm31, (%rax) > movw %dx, 16(%rax) > ret > > I want to avoid store and load. I am testing > > if (REG_P (prev_rtx) > && HARD_REGISTER_P (prev_rtx) > && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) > { > /* Find the smallest subreg mode in the same mode class which > is not narrower than MODE and narrower than PREV_MODE. */ > machine_mode m; > fixed_size_mode candidate; > FOR_EACH_MODE_IN_CLASS (m, GET_MODE_CLASS (mode)) > if (is_a<fixed_size_mode> (m, &candidate)) > { > if (GET_MODE_SIZE (candidate) > >= GET_MODE_SIZE (prev_mode)) > break; > if (GET_MODE_SIZE (candidate) >= GET_MODE_SIZE (mode) > && lowpart_subreg_regno (REGNO (prev_rtx), > prev_mode, candidate) >= 0) > { > target = lowpart_subreg (candidate, prev_rtx, > prev_mode); > prev_rtx = target; > prev_mode = candidate; > break; > } > }
That doesn't seem better though. There are still two subregs involved. > if (target == nullptr) > prev_rtx = copy_to_reg (prev_rtx); > } > > to get > > movq ops(%rip), %rax > vpbroadcastb %edi, %xmm31 > vmovd %xmm31, %edx > movw %dx, 16(%rax) > vmovdqu8 %xmm31, (%rax) > ret What does the pre-RA RTL look like for the code that you want? Thanks, Richard