On Thu, Jun 3, 2021 at 5:49 AM H.J. Lu <hjl.to...@gmail.com> wrote: > > Update move expanders to convert the CONST_WIDE_INT operand to vector > broadcast from a byte with AVX2. Add ix86_gen_scratch_sse_rtx to > return a scratch SSE register which won't increase stack alignment > requirement and blocks transformation by the combine pass.
Using fixed scratch reg is just too hackish for my taste. The expansion is OK to emit some optimized sequence, but the approach to use fixed reg to bypass stack alignment functionality and combine is not. Perhaps a new insn pattern should be introduced, e.g. (define_insn_and_split "" [(set (match_opreand:V 0 "memory_operand" "=m,m") (vec_duplicate:V (match_operand:S 2 "reg_or_0_operand" "r,C")) (clobber (match_scratch:V 1 "=x"))] and split it at some appropriate point. Uros. > > A small benchmark: > > https://gitlab.com/x86-benchmarks/microbenchmark/-/tree/memset/broadcast > > shows that broadcast is a little bit faster on Intel Core i7-8559U: > > $ make > gcc -g -I. -O2 -c -o test.o test.c > gcc -g -c -o memory.o memory.S > gcc -g -c -o broadcast.o broadcast.S > gcc -o test test.o memory.o broadcast.o > ./test > memory : 99333 > broadcast: 97208 > $ > > broadcast is also smaller: > > $ size memory.o broadcast.o > text data bss dec hex filename > 132 0 0 132 84 memory.o > 122 0 0 122 7a broadcast.o > $ > > gcc/ > > PR target/100865 > * config/i386/i386-expand.c (ix86_expand_vector_init_duplicate): > New prototype. > (ix86_byte_broadcast): New function. > (ix86_convert_const_wide_int_to_broadcast): Likewise. > (ix86_expand_move): Try ix86_convert_const_wide_int_to_broadcast > if mode size is 16 bytes or bigger. > (ix86_expand_vector_move): Try > ix86_convert_const_wide_int_to_broadcast. > * config/i386/i386-protos.h (ix86_gen_scratch_sse_rtx): New > prototype. > * config/i386/i386.c (ix86_minimum_incoming_stack_boundary): Add > an argument to ignore stack_alignment_estimated. It is passed > as false by default. > (ix86_gen_scratch_sse_rtx): New function. > > gcc/testsuite/ > > PR target/100865 > * gcc.target/i386/pr100865-1.c: New test. > * gcc.target/i386/pr100865-2.c: Likewise. > * gcc.target/i386/pr100865-3.c: Likewise. > * gcc.target/i386/pr100865-4.c: Likewise. > * gcc.target/i386/pr100865-5.c: Likewise. > --- > gcc/config/i386/i386-expand.c | 103 ++++++++++++++++++--- > gcc/config/i386/i386-protos.h | 2 + > gcc/config/i386/i386.c | 50 +++++++++- > gcc/testsuite/gcc.target/i386/pr100865-1.c | 13 +++ > gcc/testsuite/gcc.target/i386/pr100865-2.c | 14 +++ > gcc/testsuite/gcc.target/i386/pr100865-3.c | 15 +++ > gcc/testsuite/gcc.target/i386/pr100865-4.c | 16 ++++ > gcc/testsuite/gcc.target/i386/pr100865-5.c | 17 ++++ > 8 files changed, 215 insertions(+), 15 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-3.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-4.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr100865-5.c > > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c > index 4185f58eed5..658adafa269 100644 > --- a/gcc/config/i386/i386-expand.c > +++ b/gcc/config/i386/i386-expand.c > @@ -93,6 +93,9 @@ along with GCC; see the file COPYING3. If not see > #include "i386-builtins.h" > #include "i386-expand.h" > > +static bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx, > + rtx); > + > /* Split one or more double-mode RTL references into pairs of half-mode > references. The RTL can be REG, offsettable MEM, integer constant, or > CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to > @@ -190,6 +193,65 @@ ix86_expand_clear (rtx dest) > emit_insn (tmp); > } > > +/* Return a byte value which V can be broadcasted from. Otherwise, > + return INT_MAX. */ > + > +static int > +ix86_byte_broadcast (HOST_WIDE_INT v) > +{ > + wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT); > + int byte_broadcast = wi::extract_uhwi (val, 0, BITS_PER_UNIT); > + for (unsigned int i = BITS_PER_UNIT; > + i < HOST_BITS_PER_WIDE_INT; > + i += BITS_PER_UNIT) > + { > + int byte = wi::extract_uhwi (val, i, BITS_PER_UNIT); > + if (byte_broadcast != byte) > + return INT_MAX; > + } > + return byte_broadcast; > +} > + > +/* Convert the CONST_WIDE_INT operand OP to broadcast in MODE. */ > + > +static rtx > +ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op) > +{ > + rtx target = nullptr; > + > + /* Convert CONST_WIDE_INT to broadcast only if vector broadcast is > + available. */ > + if (!TARGET_AVX2 || !CONST_WIDE_INT_P (op)) > + return target; > + > + HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0); > + int byte_broadcast = ix86_byte_broadcast (val); > + > + if (byte_broadcast == INT_MAX) > + return target; > + > + /* Check if OP1 can be broadcasted from VAL. */ > + for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++) > + if (val != CONST_WIDE_INT_ELT (op, i)) > + return target; > + > + unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode); > + machine_mode vector_mode; > + if (!mode_for_vector (QImode, nunits).exists (&vector_mode)) > + gcc_unreachable (); > + target = ix86_gen_scratch_sse_rtx (vector_mode, true); > + rtx byte = GEN_INT ((char) byte_broadcast); > + if (!ix86_expand_vector_init_duplicate (false, vector_mode, target, > + byte)) > + gcc_unreachable (); > + if (REGNO (target) < FIRST_PSEUDO_REGISTER) > + target = gen_rtx_REG (mode, REGNO (target)); > + else > + target = convert_to_mode (mode, target, 1); > + > + return target; > +} > + > void > ix86_expand_move (machine_mode mode, rtx operands[]) > { > @@ -347,20 +409,29 @@ ix86_expand_move (machine_mode mode, rtx operands[]) > && optimize) > op1 = copy_to_mode_reg (mode, op1); > > - if (can_create_pseudo_p () > - && CONST_DOUBLE_P (op1)) > + if (can_create_pseudo_p ()) > { > - /* If we are loading a floating point constant to a register, > - force the value to memory now, since we'll get better code > - out the back end. */ > + if (CONST_DOUBLE_P (op1)) > + { > + /* If we are loading a floating point constant to a > + register, force the value to memory now, since we'll > + get better code out the back end. */ > > - op1 = validize_mem (force_const_mem (mode, op1)); > - if (!register_operand (op0, mode)) > + op1 = validize_mem (force_const_mem (mode, op1)); > + if (!register_operand (op0, mode)) > + { > + rtx temp = gen_reg_rtx (mode); > + emit_insn (gen_rtx_SET (temp, op1)); > + emit_move_insn (op0, temp); > + return; > + } > + } > + else if (GET_MODE_SIZE (mode) >= 16) > { > - rtx temp = gen_reg_rtx (mode); > - emit_insn (gen_rtx_SET (temp, op1)); > - emit_move_insn (op0, temp); > - return; > + rtx tmp = ix86_convert_const_wide_int_to_broadcast > + (GET_MODE (op0), op1); > + if (tmp != nullptr) > + op1 = tmp; > } > } > } > @@ -407,7 +478,15 @@ ix86_expand_vector_move (machine_mode mode, rtx > operands[]) > op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1)); > } > else > - op1 = validize_mem (force_const_mem (mode, op1)); > + { > + machine_mode mode = GET_MODE (op0); > + rtx tmp = ix86_convert_const_wide_int_to_broadcast > + (mode, op1); > + if (tmp == nullptr) > + op1 = validize_mem (force_const_mem (mode, op1)); > + else > + op1 = tmp; > + } > } > > /* We need to check memory alignment for SSE mode since attribute > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h > index 7782cf1163f..01b3775df92 100644 > --- a/gcc/config/i386/i386-protos.h > +++ b/gcc/config/i386/i386-protos.h > @@ -50,6 +50,8 @@ extern void ix86_reset_previous_fndecl (void); > > extern bool ix86_using_red_zone (void); > > +extern rtx ix86_gen_scratch_sse_rtx (machine_mode, bool); > + > extern unsigned int ix86_regmode_natural_size (machine_mode); > #ifdef RTX_CODE > extern int standard_80387_constant_p (rtx); > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index 04649b42122..3595fb5778e 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -415,7 +415,8 @@ static unsigned int split_stack_prologue_scratch_regno > (void); > static bool i386_asm_output_addr_const_extra (FILE *, rtx); > > static bool ix86_can_inline_p (tree, tree); > -static unsigned int ix86_minimum_incoming_stack_boundary (bool); > +static unsigned int ix86_minimum_incoming_stack_boundary (bool, > + bool = false); > > > /* Whether -mtune= or -march= were specified */ > @@ -7232,7 +7233,8 @@ find_drap_reg (void) > /* Return minimum incoming stack alignment. */ > > static unsigned int > -ix86_minimum_incoming_stack_boundary (bool sibcall) > +ix86_minimum_incoming_stack_boundary (bool sibcall, > + bool ignore_estimated) > { > unsigned int incoming_stack_boundary; > > @@ -7247,7 +7249,8 @@ ix86_minimum_incoming_stack_boundary (bool sibcall) > estimated stack alignment is 128bit. */ > else if (!sibcall > && ix86_force_align_arg_pointer > - && crtl->stack_alignment_estimated == 128) > + && (ignore_estimated > + || crtl->stack_alignment_estimated == 128)) > incoming_stack_boundary = MIN_STACK_BOUNDARY; > else > incoming_stack_boundary = ix86_default_incoming_stack_boundary; > @@ -23061,6 +23064,47 @@ ix86_optab_supported_p (int op, machine_mode mode1, > machine_mode, > } > } > > +/* Return a scratch register in MODE for vector load and store. If > + CONSTANT_BROADCAST is true, it is used to hold constant broadcast > + result. */ > + > +rtx > +ix86_gen_scratch_sse_rtx (machine_mode mode, bool constant_broadcast) > +{ > + bool need_hard_reg = false; > + > + /* NB: Choose a hard scratch SSE register: > + 1. Avoid increasing stack alignment requirement. > + 2. for constant broadcast in 64-bit mode, avoid transformation by > + the combine pass. > + */ > + if (GET_MODE_SIZE (mode) >= 16 > + && !COMPLEX_MODE_P (mode) > + && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))) > + { > + if (constant_broadcast > + && TARGET_64BIT > + && GET_MODE_SIZE (mode) == 16) > + need_hard_reg = true; > + else > + { > + unsigned int incoming_stack_boundary > + = ix86_minimum_incoming_stack_boundary (false, true); > + if (GET_MODE_ALIGNMENT (mode) > incoming_stack_boundary) > + need_hard_reg = true; > + } > + } > + > + rtx target; > + if (need_hard_reg) > + target = gen_rtx_REG (mode, (TARGET_64BIT > + ? LAST_REX_SSE_REG > + : LAST_SSE_REG)); > + else > + target = gen_reg_rtx (mode); > + return target; > +} > + > /* Address space support. > > This is not "far pointers" in the 16-bit sense, but an easy way > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-1.c > b/gcc/testsuite/gcc.target/i386/pr100865-1.c > new file mode 100644 > index 00000000000..6c3097fb2a6 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr100865-1.c > @@ -0,0 +1,13 @@ > +/* { dg-do compile { target { ! ia32 } } } */ > +/* { dg-options "-O2 -march=x86-64" } */ > + > +extern char *dst; > + > +void > +foo (void) > +{ > + __builtin_memset (dst, 3, 16); > +} > + > +/* { dg-final { scan-assembler-times "movdqa\[ \\t\]+\[^\n\]*%xmm" 1 } } */ > +/* { dg-final { scan-assembler-times "movups\[\\t \]%xmm\[0-9\]+, > \\(%\[\^,\]+\\)" 1 } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-2.c > b/gcc/testsuite/gcc.target/i386/pr100865-2.c > new file mode 100644 > index 00000000000..17efe2d72a3 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr100865-2.c > @@ -0,0 +1,14 @@ > +/* { dg-do compile { target { ! ia32 } } } */ > +/* { dg-options "-O2 -march=skylake" } */ > + > +extern char *dst; > + > +void > +foo (void) > +{ > + __builtin_memset (dst, 3, 16); > +} > + > +/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, > %xmm\[0-9\]+" 1 } } */ > +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, > \\(%\[\^,\]+\\)" 1 } } */ > +/* { dg-final { scan-assembler-not "vmovdqa" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-3.c > b/gcc/testsuite/gcc.target/i386/pr100865-3.c > new file mode 100644 > index 00000000000..56665d8c9b6 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr100865-3.c > @@ -0,0 +1,15 @@ > +/* { dg-do compile { target { ! ia32 } } } */ > +/* { dg-options "-O2 -march=skylake-avx512" } */ > + > +extern char *dst; > + > +void > +foo (void) > +{ > + __builtin_memset (dst, 3, 16); > +} > + > +/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%.*, > %xmm\[0-9\]+" 1 } } */ > +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, > \\(%\[\^,\]+\\)" 1 } } */ > +/* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, > %xmm\[0-9\]+" } } */ > +/* { dg-final { scan-assembler-not "vmovdqa" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4.c > b/gcc/testsuite/gcc.target/i386/pr100865-4.c > new file mode 100644 > index 00000000000..1622a06f57e > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr100865-4.c > @@ -0,0 +1,16 @@ > +/* { dg-do compile { target { ! ia32 } } } */ > +/* { dg-options "-O2 -march=skylake" } */ > + > +extern char array[64]; > + > +void > +foo (void) > +{ > + int i; > + for (i = 0; i < 64; i++) > + array[i] = -45; > +} > + > +/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, > %xmm\[0-9\]+" 1 } } */ > +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, " 4 } } */ > +/* { dg-final { scan-assembler-not "vmovdqa" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-5.c > b/gcc/testsuite/gcc.target/i386/pr100865-5.c > new file mode 100644 > index 00000000000..66040d9f972 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr100865-5.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile { target { ! ia32 } } } */ > +/* { dg-options "-O2 -march=skylake-avx512" } */ > + > +extern char array[64]; > + > +void > +foo (void) > +{ > + int i; > + for (i = 0; i < 64; i++) > + array[i] = -45; > +} > + > +/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%.*, > %xmm\[0-9\]+" 1 } } */ > +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, " 4 } } */ > +/* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, > %xmm\[0-9\]+" } } */ > +/* { dg-final { scan-assembler-not "vmovdqa" } } */ > -- > 2.31.1 >