On Tue, Jul 13, 2021 at 9:35 PM Hongtao Liu <crazy...@gmail.com> wrote: > > On Wed, Jul 14, 2021 at 10:34 AM liuhongt <hongtao....@intel.com> wrote: > > > > By optimizing vector movement to broadcast in ix86_expand_vector_move > > during pass_expand, pass_reload/LRA can automatically generate an avx512 > > embedded broadcast, pass_cpb is not needed. > > > > Considering that in the absence of avx512f, broadcast from memory is > > still slightly faster than loading the entire memory, so always enable > > broadcast. > > > > benchmark: > > https://gitlab.com/x86-benchmarks/microbenchmark/-/tree/vaddps/broadcast > > > > The performance diff > > > > strategy : cycles > > memory : 1046611188 > > memory : 1255420817 > > memory : 1044720793 > > memory : 1253414145 > > average : 1097868397 > > > > broadcast : 1044430688 > > broadcast : 1044477630 > > broadcast : 1253554603 > > broadcast : 1044561934 > > average : 1096756213 > > > > But however broadcast has larger size. > > > > the size diff > > > > size broadcast.o > > text data bss dec hex filename > > 137 0 0 137 89 broadcast.o > > > > size memory.o > > text data bss dec hex filename > > 115 0 0 115 73 memory.o > > > > Bootstrapped and regtested on x86_64-linux-gnu{-m32,} > > > > gcc/ChangeLog: > > > > * config/i386/i386-expand.c > > (ix86_broadcast_from_integer_constant): Rename to .. > > (ix86_broadcast_from_constant): .. this, and extend it to > > handle float mode. > > (ix86_expand_vector_move): Extend to float mode. > > * config/i386/i386-features.c > > (replace_constant_pool_with_broadcast): Remove. > > (remove_partial_avx_dependency_gate): Ditto. > > (constant_pool_broadcast): Ditto. > > (class pass_constant_pool_broadcast): Ditto. > > (make_pass_constant_pool_broadcast): Ditto. > > (remove_partial_avx_dependency): Adjust gate. > > * config/i386/i386-passes.def: Remove pass_constant_pool_broadcast. > > * config/i386/i386-protos.h > > (make_pass_constant_pool_broadcast): Remove. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/i386/fuse-caller-save-xmm.c: Adjust testcase. > > --- > > gcc/config/i386/i386-expand.c | 29 +++- > > gcc/config/i386/i386-features.c | 157 +----------------- > > gcc/config/i386/i386-passes.def | 1 - > > gcc/config/i386/i386-protos.h | 1 - > > .../gcc.target/i386/fuse-caller-save-xmm.c | 2 +- > > 5 files changed, 26 insertions(+), 164 deletions(-) > > > > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c > > index 69ea79e6123..ba870145acd 100644 > > --- a/gcc/config/i386/i386-expand.c > > +++ b/gcc/config/i386/i386-expand.c > > @@ -453,8 +453,10 @@ ix86_expand_move (machine_mode mode, rtx operands[]) > > emit_insn (gen_rtx_SET (op0, op1)); > > } > > > > +/* OP is a memref of CONST_VECTOR, return scalar constant mem > > + if CONST_VECTOR is a vec_duplicate, else return NULL. */ > > static rtx > > -ix86_broadcast_from_integer_constant (machine_mode mode, rtx op) > > +ix86_broadcast_from_constant (machine_mode mode, rtx op) > > { > > int nunits = GET_MODE_NUNITS (mode); > > if (nunits < 2) > > @@ -462,7 +464,8 @@ ix86_broadcast_from_integer_constant (machine_mode > > mode, rtx op) > > > > /* Don't use integer vector broadcast if we can't move from GPR to SSE > > register directly. */ > > - if (!TARGET_INTER_UNIT_MOVES_TO_VEC) > > + if (!TARGET_INTER_UNIT_MOVES_TO_VEC > > + && INTEGRAL_MODE_P (mode)) > > return nullptr; > > > > /* Convert CONST_VECTOR to a non-standard SSE constant integer > > @@ -470,12 +473,17 @@ ix86_broadcast_from_integer_constant (machine_mode > > mode, rtx op) > > if (!(TARGET_AVX2 > > || (TARGET_AVX > > && (GET_MODE_INNER (mode) == SImode > > - || GET_MODE_INNER (mode) == DImode))) > > + || GET_MODE_INNER (mode) == DImode)) > > + || FLOAT_MODE_P (mode)) > > || standard_sse_constant_p (op, mode)) > > return nullptr; > > > > - /* Don't broadcast from a 64-bit integer constant in 32-bit mode. */ > > - if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT) > > + /* Don't broadcast from a 64-bit integer constant in 32-bit mode. > > + We can still put 64-bit integer constant in memory when > > + avx512 embed broadcast is available. */ > > + if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT > > + && (!TARGET_AVX512F > > + || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL))) > > return nullptr; > > > > if (GET_MODE_INNER (mode) == TImode) > > @@ -561,17 +569,20 @@ ix86_expand_vector_move (machine_mode mode, rtx > > operands[]) > > > > if (can_create_pseudo_p () > > && GET_MODE_SIZE (mode) >= 16 > > - && GET_MODE_CLASS (mode) == MODE_VECTOR_INT > > + && VECTOR_MODE_P (mode) > > && (MEM_P (op1) > > && SYMBOL_REF_P (XEXP (op1, 0)) > > && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0)))) > > { > > - rtx first = ix86_broadcast_from_integer_constant (mode, op1); > > + rtx first = ix86_broadcast_from_constant (mode, op1); > > if (first != nullptr) > > { > > /* Broadcast to XMM/YMM/ZMM register from an integer > > - constant. */ > > - op1 = ix86_gen_scratch_sse_rtx (mode); > > + constant or scalar mem. */ > > + op1 = gen_reg_rtx (mode); > I've changed ix86_gen_scratch_sse_rtx to gen_reg_rtx to let LRA/reload > make the final decision for register usage, would that make sense?
Hard registers are used for 2 purposes: 1. Prevent stack realignment when the original code doesn't use vector registers, which is the same for memcpy and memset. 2. Prevent combine to convert constant broadcast to load from constant pool. Please add an argument to ix86_gen_scratch_sse_rtx to use hard registers for the above 2 cases. You can get pseudo registers for your use cases. > w/o that, hard register used here will prevent LRA/reload combine (set > op1 (vec_duplicate: mem)) and (add (op1, op2)) to (add (op1, > vec_duplicate: mem)) > > > + if (FLOAT_MODE_P (mode) > > + || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode)) > > + first = force_const_mem (GET_MODE_INNER (mode), first); > > bool ok = ix86_expand_vector_init_duplicate (false, mode, > > op1, first); > > gcc_assert (ok); > > diff --git a/gcc/config/i386/i386-features.c > > b/gcc/config/i386/i386-features.c > > index cbd430a2ecf..d9c6652d1c9 100644 > > --- a/gcc/config/i386/i386-features.c > > +++ b/gcc/config/i386/i386-features.c > > @@ -2136,81 +2136,6 @@ make_pass_insert_endbr_and_patchable_area > > (gcc::context *ctxt) > > return new pass_insert_endbr_and_patchable_area (ctxt); > > } > > > > -/* Replace all one-value const vector that are referenced by SYMBOL_REFs > > in x > > - with embedded broadcast. i.e.transform > > - > > - vpaddq .LC0(%rip), %zmm0, %zmm0 > > - ret > > - .LC0: > > - .quad 3 > > - .quad 3 > > - .quad 3 > > - .quad 3 > > - .quad 3 > > - .quad 3 > > - .quad 3 > > - .quad 3 > > - > > - to > > - > > - vpaddq .LC0(%rip){1to8}, %zmm0, %zmm0 > > - ret > > - .LC0: > > - .quad 3 */ > > -static void > > -replace_constant_pool_with_broadcast (rtx_insn *insn) > > -{ > > - subrtx_ptr_iterator::array_type array; > > - FOR_EACH_SUBRTX_PTR (iter, array, &PATTERN (insn), ALL) > > - { > > - rtx *loc = *iter; > > - rtx x = *loc; > > - rtx broadcast_mem, vec_dup, constant, first; > > - machine_mode mode; > > - > > - /* Constant pool. */ > > - if (!MEM_P (x) > > - || !SYMBOL_REF_P (XEXP (x, 0)) > > - || !CONSTANT_POOL_ADDRESS_P (XEXP (x, 0))) > > - continue; > > - > > - /* Const vector. */ > > - mode = GET_MODE (x); > > - if (!VECTOR_MODE_P (mode)) > > - return; > > - constant = get_pool_constant (XEXP (x, 0)); > > - if (GET_CODE (constant) != CONST_VECTOR) > > - return; > > - > > - /* There could be some rtx like > > - (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1"))) > > - but with "*.LC1" refer to V2DI constant vector. */ > > - if (GET_MODE (constant) != mode) > > - { > > - constant = simplify_subreg (mode, constant, GET_MODE (constant), > > 0); > > - if (constant == NULL_RTX || GET_CODE (constant) != CONST_VECTOR) > > - return; > > - } > > - first = XVECEXP (constant, 0, 0); > > - > > - for (int i = 1; i < GET_MODE_NUNITS (mode); ++i) > > - { > > - rtx tmp = XVECEXP (constant, 0, i); > > - /* Vector duplicate value. */ > > - if (!rtx_equal_p (tmp, first)) > > - return; > > - } > > - > > - /* Replace with embedded broadcast. */ > > - broadcast_mem = force_const_mem (GET_MODE_INNER (mode), first); > > - vec_dup = gen_rtx_VEC_DUPLICATE (mode, broadcast_mem); > > - validate_change (insn, loc, vec_dup, 0); > > - > > - /* At most 1 memory_operand in an insn. */ > > - return; > > - } > > -} > > - > > /* At entry of the nearest common dominator for basic blocks with > > conversions, generate a single > > vxorps %xmmN, %xmmN, %xmmN > > @@ -2249,10 +2174,6 @@ remove_partial_avx_dependency (void) > > if (!NONDEBUG_INSN_P (insn)) > > continue; > > > > - /* Handle AVX512 embedded broadcast here to save compile time. */ > > - if (TARGET_AVX512F) > > - replace_constant_pool_with_broadcast (insn); > > - > > set = single_set (insn); > > if (!set) > > continue; > > @@ -2384,16 +2305,6 @@ remove_partial_avx_dependency (void) > > return 0; > > } > > > > -static bool > > -remove_partial_avx_dependency_gate () > > -{ > > - return (TARGET_AVX > > - && TARGET_SSE_PARTIAL_REG_DEPENDENCY > > - && TARGET_SSE_MATH > > - && optimize > > - && optimize_function_for_speed_p (cfun)); > > -} > > - > > namespace { > > > > const pass_data pass_data_remove_partial_avx_dependency = > > @@ -2419,7 +2330,11 @@ public: > > /* opt_pass methods: */ > > virtual bool gate (function *) > > { > > - return remove_partial_avx_dependency_gate (); > > + return (TARGET_AVX > > + && TARGET_SSE_PARTIAL_REG_DEPENDENCY > > + && TARGET_SSE_MATH > > + && optimize > > + && optimize_function_for_speed_p (cfun)); > > } > > > > virtual unsigned int execute (function *) > > @@ -2436,68 +2351,6 @@ make_pass_remove_partial_avx_dependency > > (gcc::context *ctxt) > > return new pass_remove_partial_avx_dependency (ctxt); > > } > > > > -/* For const vector having one duplicated value, there's no need to put > > - whole vector in the constant pool when target supports embedded > > broadcast. */ > > -static unsigned int > > -constant_pool_broadcast (void) > > -{ > > - timevar_push (TV_MACH_DEP); > > - rtx_insn *insn; > > - > > - for (insn = get_insns (); insn; insn = NEXT_INSN (insn)) > > - { > > - if (INSN_P (insn)) > > - replace_constant_pool_with_broadcast (insn); > > - } > > - timevar_pop (TV_MACH_DEP); > > - return 0; > > -} > > - > > -namespace { > > - > > -const pass_data pass_data_constant_pool_broadcast = > > -{ > > - RTL_PASS, /* type */ > > - "cpb", /* name */ > > - OPTGROUP_NONE, /* optinfo_flags */ > > - TV_MACH_DEP, /* tv_id */ > > - 0, /* properties_required */ > > - 0, /* properties_provided */ > > - 0, /* properties_destroyed */ > > - 0, /* todo_flags_start */ > > - TODO_df_finish, /* todo_flags_finish */ > > -}; > > - > > -class pass_constant_pool_broadcast : public rtl_opt_pass > > -{ > > -public: > > - pass_constant_pool_broadcast (gcc::context *ctxt) > > - : rtl_opt_pass (pass_data_constant_pool_broadcast, ctxt) > > - {} > > - > > - /* opt_pass methods: */ > > - virtual bool gate (function *) > > - { > > - /* Return false if rpad pass gate is true. > > - replace_constant_pool_with_broadcast is called > > - from both this pass and rpad pass. */ > > - return (TARGET_AVX512F && !remove_partial_avx_dependency_gate ()); > > - } > > - > > - virtual unsigned int execute (function *) > > - { > > - return constant_pool_broadcast (); > > - } > > -}; // class pass_cpb > > - > > -} // anon namespace > > - > > -rtl_opt_pass * > > -make_pass_constant_pool_broadcast (gcc::context *ctxt) > > -{ > > - return new pass_constant_pool_broadcast (ctxt); > > -} > > - > > /* This compares the priority of target features in function DECL1 > > and DECL2. It returns positive value if DECL1 is higher priority, > > negative value if DECL2 is higher priority and 0 if they are the > > diff --git a/gcc/config/i386/i386-passes.def > > b/gcc/config/i386/i386-passes.def > > index 44df00e94ac..29baf8acd0b 100644 > > --- a/gcc/config/i386/i386-passes.def > > +++ b/gcc/config/i386/i386-passes.def > > @@ -33,4 +33,3 @@ along with GCC; see the file COPYING3. If not see > > INSERT_PASS_BEFORE (pass_shorten_branches, 1, > > pass_insert_endbr_and_patchable_area); > > > > INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency); > > - INSERT_PASS_AFTER (pass_combine, 1, pass_constant_pool_broadcast); > > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h > > index 51376fcc454..07ac02aff69 100644 > > --- a/gcc/config/i386/i386-protos.h > > +++ b/gcc/config/i386/i386-protos.h > > @@ -395,4 +395,3 @@ extern rtl_opt_pass > > *make_pass_insert_endbr_and_patchable_area > > (gcc::context *); > > extern rtl_opt_pass *make_pass_remove_partial_avx_dependency > > (gcc::context *); > > -extern rtl_opt_pass *make_pass_constant_pool_broadcast (gcc::context *); > > diff --git a/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c > > b/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c > > index 4deff93c1e8..b0d3dc38a0c 100644 > > --- a/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c > > +++ b/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c > > @@ -6,7 +6,7 @@ typedef double v2df __attribute__((vector_size (16))); > > static v2df __attribute__((noinline)) > > bar (v2df a) > > { > > - return a + (v2df){ 3.0, 3.0 }; > > + return a + (v2df){ 3.0, 4.0 }; > > } > > > > v2df __attribute__((noinline)) > > -- > > 2.18.1 > > > > > -- > BR, > Hongtao -- H.J.