On Wed, Jul 21, 2021 at 12:20 PM Richard Sandiford <richard.sandif...@arm.com> wrote: > > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > > On Wed, Jul 21, 2021 at 7:50 AM Richard Sandiford > > <richard.sandif...@arm.com> wrote: > >> > >> "H.J. Lu" <hjl.to...@gmail.com> writes: > >> > diff --git a/gcc/builtins.c b/gcc/builtins.c > >> > index 39ab139b7e1..1972301ce3c 100644 > >> > --- a/gcc/builtins.c > >> > +++ b/gcc/builtins.c > >> > @@ -3890,13 +3890,16 @@ expand_builtin_strnlen (tree exp, rtx target, > >> > machine_mode target_mode) > >> > > >> > static rtx > >> > builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset, > >> > - scalar_int_mode mode) > >> > + fixed_size_mode mode) > >> > { > >> > /* The REPresentation pointed to by DATA need not be a nul-terminated > >> > string but the caller guarantees it's large enough for MODE. */ > >> > const char *rep = (const char *) data; > >> > > >> > - return c_readstr (rep + offset, mode, /*nul_terminated=*/false); > >> > + /* NB: Vector mode in the by-pieces infrastructure is only used by > >> > + the memset expander. */ > >> > >> Sorry to nitpick, but I guess this might get out out-of-date. Maybe: > >> > >> /* The by-pieces infrastructure does not try to pick a vector mode > >> for memcpy expansion. */ > > > > Fixed. > > > >> > + return c_readstr (rep + offset, as_a <scalar_int_mode> (mode), > >> > + /*nul_terminated=*/false); > >> > } > >> > > >> > /* LEN specify length of the block of memcpy/memset operation. > >> > @@ -6478,14 +6481,16 @@ expand_builtin_stpncpy (tree exp, rtx) > >> > > >> > rtx > >> > builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset, > >> > - scalar_int_mode mode) > >> > + fixed_size_mode mode) > >> > { > >> > const char *str = (const char *) data; > >> > > >> > if ((unsigned HOST_WIDE_INT) offset > strlen (str)) > >> > return const0_rtx; > >> > > >> > - return c_readstr (str + offset, mode); > >> > + /* NB: Vector mode in the by-pieces infrastructure is only used by > >> > + the memset expander. */ > >> > >> Similarly here for strncpy expansion. > > > > Fixed. > > > >> > + return c_readstr (str + offset, as_a <scalar_int_mode> (mode)); > >> > } > >> > > >> > /* Helper to check the sizes of sequences and the destination of calls > >> > @@ -6686,30 +6691,117 @@ expand_builtin_strncpy (tree exp, rtx target) > >> > return NULL_RTX; > >> > } > >> > > >> > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) > >> > - bytes from constant string DATA + OFFSET and return it as target > >> > - constant. If PREV isn't nullptr, it has the RTL info from the > >> > +/* Return the RTL of a register in MODE generated from PREV in the > >> > previous iteration. */ > >> > > >> > -rtx > >> > -builtin_memset_read_str (void *data, void *prevp, > >> > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > >> > - scalar_int_mode mode) > >> > +static rtx > >> > +gen_memset_value_from_prev (by_pieces_prev *prev, fixed_size_mode mode) > >> > { > >> > - by_pieces_prev *prev = (by_pieces_prev *) prevp; > >> > + rtx target = nullptr; > >> > if (prev != nullptr && prev->data != nullptr) > >> > { > >> > /* Use the previous data in the same mode. */ > >> > if (prev->mode == mode) > >> > return prev->data; > >> > + > >> > + fixed_size_mode prev_mode = prev->mode; > >> > + > >> > + /* Don't use the previous data to write QImode if it is in a > >> > + vector mode. */ > >> > + if (VECTOR_MODE_P (prev_mode) && mode == QImode) > >> > + return target; > >> > + > >> > + rtx prev_rtx = prev->data; > >> > + > >> > + if (REG_P (prev_rtx) > >> > + && HARD_REGISTER_P (prev_rtx) > >> > + && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) > >> > + { > >> > + /* If we can't put a hard register in MODE, first generate a > >> > + subreg of word mode if the previous mode is wider than word > >> > + mode and word mode is wider than MODE. */ > >> > + if (UNITS_PER_WORD < GET_MODE_SIZE (prev_mode) > >> > + && UNITS_PER_WORD > GET_MODE_SIZE (mode)) > >> > + { > >> > + prev_rtx = lowpart_subreg (word_mode, prev_rtx, > >> > + prev_mode); > >> > + if (prev_rtx != nullptr) > >> > + prev_mode = word_mode; > >> > + } > >> > + else > >> > + prev_rtx = nullptr; > >> > >> I don't understand this. Why not just do the: > >> > >> if (REG_P (prev_rtx) > >> && HARD_REGISTER_P (prev_rtx) > >> && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) > >> prev_rtx = copy_to_reg (prev_rtx); > >> > >> that I suggested in the previous review? > > > > But for > > --- > > extern void *ops; > > > > void > > foo (int c) > > { > > __builtin_memset (ops, c, 18); > > } > > --- > > I got > > > > vpbroadcastb %edi, %xmm31 > > vmovdqa64 %xmm31, -24(%rsp) > > movq ops(%rip), %rax > > movzwl -24(%rsp), %edx > > vmovdqu8 %xmm31, (%rax) > > movw %dx, 16(%rax) > > ret > > > > I want to avoid store and load. I am testing > > > > if (REG_P (prev_rtx) > > && HARD_REGISTER_P (prev_rtx) > > && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) > > { > > /* Find the smallest subreg mode in the same mode class which > > is not narrower than MODE and narrower than PREV_MODE. */ > > machine_mode m; > > fixed_size_mode candidate; > > FOR_EACH_MODE_IN_CLASS (m, GET_MODE_CLASS (mode)) > > if (is_a<fixed_size_mode> (m, &candidate)) > > { > > if (GET_MODE_SIZE (candidate) > > >= GET_MODE_SIZE (prev_mode)) > > break; > > if (GET_MODE_SIZE (candidate) >= GET_MODE_SIZE (mode) > > && lowpart_subreg_regno (REGNO (prev_rtx), > > prev_mode, candidate) >= 0) > > { > > target = lowpart_subreg (candidate, prev_rtx, > > prev_mode); > > prev_rtx = target; > > prev_mode = candidate; > > break; > > } > > } > > That doesn't seem better though. There are still two subregs involved.
Since a hard register is used, there is only one subreg generated: (insn 7 6 8 2 (set (reg:QI 85) (subreg:QI (reg/v:SI 83 [ c ]) 0)) "s2a.i":6:3 77 {*movqi_internal} (nil)) (insn 8 7 9 2 (set (reg:V16QI 67 xmm31) (vec_duplicate:V16QI (reg:QI 85))) "s2a.i":6:3 5165 {*avx512vl_vec_dup_g prv16qi} (nil)) (insn 9 8 10 2 (set (mem:V16QI (reg/f:DI 84) [0 MEM <char[1:18]> [(void *)ops.0_ 1]+0 S16 A8]) (reg:V16QI 67 xmm31)) "s2a.i":6:3 1574 {movv16qi_internal} (nil)) (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84) (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void *)ops.0_1]+16 S2 A8]) (subreg:HI (reg:SI 67 xmm31) 0)) "s2a.i":6:3 76 {*movhi_internal} (nil)) > > if (target == nullptr) > > prev_rtx = copy_to_reg (prev_rtx); > > } > > > > to get > > > > movq ops(%rip), %rax > > vpbroadcastb %edi, %xmm31 > > vmovd %xmm31, %edx > > movw %dx, 16(%rax) > > vmovdqu8 %xmm31, (%rax) > > ret > > What does the pre-RA RTL look like for the code that you want? WIth my code, I got (insn 8 6 9 2 (set (reg:V16QI 67 xmm31) (vec_duplicate:V16QI (subreg:QI (reg:SI 86) 0))) "s2a.i":6:3 5165 {*avx512vl_vec_dup_gprv16qi} (expr_list:REG_DEAD (reg:SI 86) (nil))) (insn 9 8 10 2 (set (mem:V16QI (reg/f:DI 84 [ ops ]) [0 MEM <char[1:18]> [(void *)ops.0_1]+0 S16 A8]) (reg:V16QI 67 xmm31)) "s2a.i":6:3 1574 {movv16qi_internal} (nil)) (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84 [ ops ]) (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void *)ops.0_1]+16 S2 A8]) (subreg:HI (reg:SI 67 xmm31) 0)) "s2a.i":6:3 76 {*movhi_internal} (expr_list:REG_DEAD (reg/f:DI 84 [ ops ]) (expr_list:REG_DEAD (reg:SI 67 xmm31) (nil)))) With copy_to_reg, I got (insn 8 6 9 2 (set (reg:V16QI 67 xmm31) (vec_duplicate:V16QI (subreg:QI (reg:SI 87) 0))) "s2a.i":6:3 5165 {*avx512vl_vec_dup_gprv16qi} (expr_list:REG_DEAD (reg:SI 87) (nil))) (insn 9 8 15 2 (set (mem:V16QI (reg/f:DI 84 [ ops ]) [0 MEM <char[1:18]> [(void *)ops.0_1]+0 S16 A8]) (reg:V16QI 67 xmm31)) "s2a.i":6:3 1574 {movv16qi_internal} (nil)) (insn 15 9 10 2 (set (reg:V16QI 88) (reg:V16QI 67 xmm31)) "s2a.i":6:3 1574 {movv16qi_internal} (expr_list:REG_DEAD (reg:V16QI 67 xmm31) (nil))) (note 10 15 11 2 NOTE_INSN_DELETED) (insn 11 10 0 2 (set (mem:HI (plus:DI (reg/f:DI 84 [ ops ]) (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void *)ops.0_1]+16 S2 A8]) (subreg:HI (reg:V16QI 88) 0)) "s2a.i":6:3 76 {*movhi_internal} (expr_list:REG_DEAD (reg:V16QI 88) (expr_list:REG_DEAD (reg/f:DI 84 [ ops ]) (nil)))) > Thanks, > Richard -- H.J.