1. Rewrite builtin_memset_read_str/builtin_memset_gen_str to use a vector broadcast to duplicate a QI value into a TI/OI/XI value for memset. 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard scratch register to avoid stack realignment when expanding memset.
PR middle-end/90773 * builtins.c (gen_memset_value_from_prev): New function. (gen_memset_broadcast): Likewise. (builtin_memset_read_str): Use gen_memset_value_from_prev and gen_memset_broadcast. (builtin_memset_gen_str): Likewise. * target.def (gen_memset_scratch_rtx): New hook. * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX. * doc/tm.texi: Regenerated. --- gcc/builtins.c | 123 +++++++++++++++++++++++++++++++++++++-------- gcc/doc/tm.texi | 5 ++ gcc/doc/tm.texi.in | 2 + gcc/target.def | 7 +++ 4 files changed, 116 insertions(+), 21 deletions(-) diff --git a/gcc/builtins.c b/gcc/builtins.c index e5e39386a93..e938d610f12 100644 --- a/gcc/builtins.c +++ b/gcc/builtins.c @@ -6639,26 +6639,111 @@ expand_builtin_strncpy (tree exp, rtx target) return NULL_RTX; } -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) - bytes from constant string DATA + OFFSET and return it as target - constant. If PREV isn't nullptr, it has the RTL info from the +/* Return the RTL of a register in MODE generated from PREV in the previous iteration. */ -rtx -builtin_memset_read_str (void *data, void *prevp, - HOST_WIDE_INT offset ATTRIBUTE_UNUSED, - scalar_int_mode mode) +static rtx +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode) { + rtx target = nullptr; by_pieces_prev *prev = (by_pieces_prev *) prevp; if (prev != nullptr && prev->data != nullptr) { /* Use the previous data in the same mode. */ if (prev->mode == mode) return prev->data; + + rtx prev_rtx = prev->data; + machine_mode prev_mode = prev->mode; + unsigned int word_size = GET_MODE_SIZE (word_mode); + if (word_size < GET_MODE_SIZE (prev->mode) + && word_size > GET_MODE_SIZE (mode)) + { + /* First generate subreg of word mode if the previous mode is + wider than word mode and word mode is wider than MODE. 
*/ + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx, + prev_mode, 0); + prev_mode = word_mode; + } + if (prev_rtx != nullptr) + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0); } + return target; +} + +/* Return the RTL of a register in MODE broadcasted from DATA. */ + +static rtx +gen_memset_broadcast (rtx data, scalar_int_mode mode) +{ + /* Skip if regno_reg_rtx isn't initialized. */ + if (!regno_reg_rtx) + return nullptr; + + rtx target = nullptr; + + unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode); + machine_mode vector_mode; + if (!mode_for_vector (QImode, nunits).exists (&vector_mode)) + gcc_unreachable (); + + enum insn_code icode = optab_handler (vec_duplicate_optab, + vector_mode); + if (icode != CODE_FOR_nothing) + { + rtx reg = targetm.gen_memset_scratch_rtx (vector_mode); + if (CONST_INT_P (data)) + { + /* Use the move expander with CONST_VECTOR. */ + rtx const_vec = gen_const_vec_duplicate (vector_mode, data); + emit_move_insn (reg, const_vec); + } + else + { + + class expand_operand ops[2]; + create_output_operand (&ops[0], reg, vector_mode); + create_input_operand (&ops[1], data, QImode); + expand_insn (icode, 2, ops); + if (!rtx_equal_p (reg, ops[0].value)) + emit_move_insn (reg, ops[0].value); + } + target = lowpart_subreg (mode, reg, vector_mode); + } + + return target; +} + +/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) + bytes from constant string DATA + OFFSET and return it as target + constant. If PREV isn't nullptr, it has the RTL info from the + previous iteration. */ +rtx +builtin_memset_read_str (void *data, void *prev, + HOST_WIDE_INT offset ATTRIBUTE_UNUSED, + scalar_int_mode mode) +{ + rtx target; const char *c = (const char *) data; - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode)); + char *p; + + /* Don't use the previous value if size is 1. 
*/ + if (GET_MODE_SIZE (mode) != 1) + { + target = gen_memset_value_from_prev (prev, mode); + if (target != nullptr) + return target; + + p = XALLOCAVEC (char, GET_MODE_SIZE (QImode)); + memset (p, *c, GET_MODE_SIZE (QImode)); + rtx src = c_readstr (p, QImode); + target = gen_memset_broadcast (src, mode); + if (target != nullptr) + return target; + } + + p = XALLOCAVEC (char, GET_MODE_SIZE (mode)); memset (p, *c, GET_MODE_SIZE (mode)); @@ -6672,7 +6757,7 @@ builtin_memset_read_str (void *data, void *prevp, nullptr, it has the RTL info from the previous iteration. */ static rtx -builtin_memset_gen_str (void *data, void *prevp, +builtin_memset_gen_str (void *data, void *prev, HOST_WIDE_INT offset ATTRIBUTE_UNUSED, scalar_int_mode mode) { @@ -6680,22 +6765,18 @@ builtin_memset_gen_str (void *data, void *prevp, size_t size; char *p; - by_pieces_prev *prev = (by_pieces_prev *) prevp; - if (prev != nullptr && prev->data != nullptr) - { - /* Use the previous data in the same mode. */ - if (prev->mode == mode) - return prev->data; - - target = simplify_gen_subreg (mode, prev->data, prev->mode, 0); - if (target != nullptr) - return target; - } - size = GET_MODE_SIZE (mode); if (size == 1) return (rtx) data; + target = gen_memset_value_from_prev (prev, mode); + if (target != nullptr) + return target; + + target = gen_memset_broadcast ((rtx) data, mode); + if (target != nullptr) + return target; + p = XALLOCAVEC (char, size); memset (p, 1, size); coeff = c_readstr (p, mode); diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index 2a41ae5fba1..8849711fcf8 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -12122,6 +12122,11 @@ This function prepares to emit a conditional comparison within a sequence @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares. @end deftypefn +@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode @var{mode}) +This hook should return an rtx for scratch register in @var{mode} to +be used by memset broadcast. 
The default is @code{gen_reg_rtx}. +@end deftypefn + @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop}) This target hook returns a new value for the number of times @var{loop} should be unrolled. The parameter @var{nunroll} is the number of times diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index f881cdabe9e..a6bbf4f2667 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -7958,6 +7958,8 @@ lists. @hook TARGET_GEN_CCMP_NEXT +@hook TARGET_GEN_MEMSET_SCRATCH_RTX + @hook TARGET_LOOP_UNROLL_ADJUST @defmac POWI_MAX_MULTS diff --git a/gcc/target.def b/gcc/target.def index c009671c583..5fb287db3bd 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -2726,6 +2726,13 @@ DEFHOOK rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree op0, tree op1, int bit_code), NULL) +DEFHOOK +(gen_memset_scratch_rtx, + "This hook should return an rtx for scratch register in @var{mode} to\n\ +be used by memset broadcast. The default is @code{gen_reg_rtx}.", + rtx, (machine_mode mode), + gen_reg_rtx) + /* Return a new value for loop unroll size. */ DEFHOOK (loop_unroll_adjust, -- 2.31.1