On Wed, Jul 14, 2021 at 8:38 PM H.J. Lu <hjl.to...@gmail.com> wrote: > > On Tue, Jul 13, 2021 at 9:35 PM Hongtao Liu <crazy...@gmail.com> wrote: > > > > On Wed, Jul 14, 2021 at 10:34 AM liuhongt <hongtao....@intel.com> wrote: > > > > > > By optimizing vector movement to broadcast in ix86_expand_vector_move > > > during pass_expand, pass_reload/LRA can automatically generate an avx512 > > > embedded broadcast, pass_cpb is not needed. > > > > > > Considering that in the absence of avx512f, broadcast from memory is > > > still slightly faster than loading the entire memory, so always enable > > > broadcast. > > > > > > benchmark: > > > https://gitlab.com/x86-benchmarks/microbenchmark/-/tree/vaddps/broadcast > > > > > > The performance diff > > > > > > strategy : cycles > > > memory : 1046611188 > > > memory : 1255420817 > > > memory : 1044720793 > > > memory : 1253414145 > > > average : 1097868397 > > > > > > broadcast : 1044430688 > > > broadcast : 1044477630 > > > broadcast : 1253554603 > > > broadcast : 1044561934 > > > average : 1096756213 > > > > > > But however broadcast has larger size. > > > > > > the size diff > > > > > > size broadcast.o > > > text data bss dec hex filename > > > 137 0 0 137 89 broadcast.o > > > > > > size memory.o > > > text data bss dec hex filename > > > 115 0 0 115 73 memory.o > > > > > > Bootstrapped and regtested on x86_64-linux-gnu{-m32,} > > > > > > gcc/ChangeLog: > > > > > > * config/i386/i386-expand.c > > > (ix86_broadcast_from_integer_constant): Rename to .. > > > (ix86_broadcast_from_constant): .. this, and extend it to > > > handle float mode. > > > (ix86_expand_vector_move): Extend to float mode. > > > * config/i386/i386-features.c > > > (replace_constant_pool_with_broadcast): Remove. > > > (remove_partial_avx_dependency_gate): Ditto. > > > (constant_pool_broadcast): Ditto. > > > (class pass_constant_pool_broadcast): Ditto. > > > (make_pass_constant_pool_broadcast): Ditto. > > > (remove_partial_avx_dependency): Adjust gate. > > > * config/i386/i386-passes.def: Remove > > > pass_constant_pool_broadcast. > > > * config/i386/i386-protos.h > > > (make_pass_constant_pool_broadcast): Remove. > > > > > > gcc/testsuite/ChangeLog: > > > > > > * gcc.target/i386/fuse-caller-save-xmm.c: Adjust testcase. > > > --- > > > gcc/config/i386/i386-expand.c | 29 +++- > > > gcc/config/i386/i386-features.c | 157 +----------------- > > > gcc/config/i386/i386-passes.def | 1 - > > > gcc/config/i386/i386-protos.h | 1 - > > > .../gcc.target/i386/fuse-caller-save-xmm.c | 2 +- > > > 5 files changed, 26 insertions(+), 164 deletions(-) > > > > > > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c > > > index 69ea79e6123..ba870145acd 100644 > > > --- a/gcc/config/i386/i386-expand.c > > > +++ b/gcc/config/i386/i386-expand.c > > > @@ -453,8 +453,10 @@ ix86_expand_move (machine_mode mode, rtx operands[]) > > > emit_insn (gen_rtx_SET (op0, op1)); > > > } > > > > > > +/* OP is a memref of CONST_VECTOR, return scalar constant mem > > > + if CONST_VECTOR is a vec_duplicate, else return NULL. */ > > > static rtx > > > -ix86_broadcast_from_integer_constant (machine_mode mode, rtx op) > > > +ix86_broadcast_from_constant (machine_mode mode, rtx op) > > > { > > > int nunits = GET_MODE_NUNITS (mode); > > > if (nunits < 2) > > > @@ -462,7 +464,8 @@ ix86_broadcast_from_integer_constant (machine_mode > > > mode, rtx op) > > > > > > /* Don't use integer vector broadcast if we can't move from GPR to SSE > > > register directly. */ > > > - if (!TARGET_INTER_UNIT_MOVES_TO_VEC) > > > + if (!TARGET_INTER_UNIT_MOVES_TO_VEC > > > + && INTEGRAL_MODE_P (mode)) > > > return nullptr; > > > > > > /* Convert CONST_VECTOR to a non-standard SSE constant integer > > > @@ -470,12 +473,17 @@ ix86_broadcast_from_integer_constant (machine_mode > > > mode, rtx op) > > > if (!(TARGET_AVX2 > > > || (TARGET_AVX > > > && (GET_MODE_INNER (mode) == SImode > > > - || GET_MODE_INNER (mode) == DImode))) > > > + || GET_MODE_INNER (mode) == DImode)) > > > + || FLOAT_MODE_P (mode)) > > > || standard_sse_constant_p (op, mode)) > > > return nullptr; > > > > > > - /* Don't broadcast from a 64-bit integer constant in 32-bit mode. */ > > > - if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT) > > > + /* Don't broadcast from a 64-bit integer constant in 32-bit mode. > > > + We can still put 64-bit integer constant in memory when > > > + avx512 embed broadcast is available. */ > > > + if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT > > > + && (!TARGET_AVX512F > > > + || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL))) > > > return nullptr; > > > > > > if (GET_MODE_INNER (mode) == TImode) > > > @@ -561,17 +569,20 @@ ix86_expand_vector_move (machine_mode mode, rtx > > > operands[]) > > > > > > if (can_create_pseudo_p () > > > && GET_MODE_SIZE (mode) >= 16 > > > - && GET_MODE_CLASS (mode) == MODE_VECTOR_INT > > > + && VECTOR_MODE_P (mode) > > > && (MEM_P (op1) > > > && SYMBOL_REF_P (XEXP (op1, 0)) > > > && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0)))) > > > { > > > - rtx first = ix86_broadcast_from_integer_constant (mode, op1); > > > + rtx first = ix86_broadcast_from_constant (mode, op1); > > > if (first != nullptr) > > > { > > > /* Broadcast to XMM/YMM/ZMM register from an integer > > > - constant. */ > > > - op1 = ix86_gen_scratch_sse_rtx (mode); > > > + constant or scalar mem. */ > > > + op1 = gen_reg_rtx (mode); > > I've changed ix86_gen_scratch_sse_rtx to gen_reg_rtx to let LRA/reload > > make the final decision for register usage, would that make sense? > > Hard registers are used for 2 purposes: > > 1. Prevent stack realignment when the original code doesn't use vector > registers, which is the same for memcpy and memset. > 2. Prevent combine to convert constant broadcast to load from constant > pool. > > Please add an argument to ix86_gen_scratch_sse_rtx to use hard registers > for the above 2 cases. You can get pseudo registers for your use cases. > Ok, Here's the patch i'm going to check in.
> > w/o that, hard register used here will prevent LRA/reload combine (set > > op1 (vec_duplicate: mem)) and (add (op1, op2)) to (add (op1, > > vec_duplicate: mem)) > > > > > + if (FLOAT_MODE_P (mode) > > > + || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode)) > > > + first = force_const_mem (GET_MODE_INNER (mode), first); > > > bool ok = ix86_expand_vector_init_duplicate (false, mode, > > > op1, first); > > > gcc_assert (ok); > > > diff --git a/gcc/config/i386/i386-features.c > > > b/gcc/config/i386/i386-features.c > > > index cbd430a2ecf..d9c6652d1c9 100644 > > > --- a/gcc/config/i386/i386-features.c > > > +++ b/gcc/config/i386/i386-features.c > > > @@ -2136,81 +2136,6 @@ make_pass_insert_endbr_and_patchable_area > > > (gcc::context *ctxt) > > > return new pass_insert_endbr_and_patchable_area (ctxt); > > > } > > > > > > -/* Replace all one-value const vector that are referenced by SYMBOL_REFs > > > in x > > > - with embedded broadcast. i.e.transform > > > - > > > - vpaddq .LC0(%rip), %zmm0, %zmm0 > > > - ret > > > - .LC0: > > > - .quad 3 > > > - .quad 3 > > > - .quad 3 > > > - .quad 3 > > > - .quad 3 > > > - .quad 3 > > > - .quad 3 > > > - .quad 3 > > > - > > > - to > > > - > > > - vpaddq .LC0(%rip){1to8}, %zmm0, %zmm0 > > > - ret > > > - .LC0: > > > - .quad 3 */ > > > -static void > > > -replace_constant_pool_with_broadcast (rtx_insn *insn) > > > -{ > > > - subrtx_ptr_iterator::array_type array; > > > - FOR_EACH_SUBRTX_PTR (iter, array, &PATTERN (insn), ALL) > > > - { > > > - rtx *loc = *iter; > > > - rtx x = *loc; > > > - rtx broadcast_mem, vec_dup, constant, first; > > > - machine_mode mode; > > > - > > > - /* Constant pool. */ > > > - if (!MEM_P (x) > > > - || !SYMBOL_REF_P (XEXP (x, 0)) > > > - || !CONSTANT_POOL_ADDRESS_P (XEXP (x, 0))) > > > - continue; > > > - > > > - /* Const vector. */ > > > - mode = GET_MODE (x); > > > - if (!VECTOR_MODE_P (mode)) > > > - return; > > > - constant = get_pool_constant (XEXP (x, 0)); > > > - if (GET_CODE (constant) != CONST_VECTOR) > > > - return; > > > - > > > - /* There could be some rtx like > > > - (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1"))) > > > - but with "*.LC1" refer to V2DI constant vector. */ > > > - if (GET_MODE (constant) != mode) > > > - { > > > - constant = simplify_subreg (mode, constant, GET_MODE > > > (constant), 0); > > > - if (constant == NULL_RTX || GET_CODE (constant) != CONST_VECTOR) > > > - return; > > > - } > > > - first = XVECEXP (constant, 0, 0); > > > - > > > - for (int i = 1; i < GET_MODE_NUNITS (mode); ++i) > > > - { > > > - rtx tmp = XVECEXP (constant, 0, i); > > > - /* Vector duplicate value. */ > > > - if (!rtx_equal_p (tmp, first)) > > > - return; > > > - } > > > - > > > - /* Replace with embedded broadcast. */ > > > - broadcast_mem = force_const_mem (GET_MODE_INNER (mode), first); > > > - vec_dup = gen_rtx_VEC_DUPLICATE (mode, broadcast_mem); > > > - validate_change (insn, loc, vec_dup, 0); > > > - > > > - /* At most 1 memory_operand in an insn. */ > > > - return; > > > - } > > > -} > > > - > > > /* At entry of the nearest common dominator for basic blocks with > > > conversions, generate a single > > > vxorps %xmmN, %xmmN, %xmmN > > > @@ -2249,10 +2174,6 @@ remove_partial_avx_dependency (void) > > > if (!NONDEBUG_INSN_P (insn)) > > > continue; > > > > > > - /* Handle AVX512 embedded broadcast here to save compile time. > > > */ > > > - if (TARGET_AVX512F) > > > - replace_constant_pool_with_broadcast (insn); > > > - > > > set = single_set (insn); > > > if (!set) > > > continue; > > > @@ -2384,16 +2305,6 @@ remove_partial_avx_dependency (void) > > > return 0; > > > } > > > > > > -static bool > > > -remove_partial_avx_dependency_gate () > > > -{ > > > - return (TARGET_AVX > > > - && TARGET_SSE_PARTIAL_REG_DEPENDENCY > > > - && TARGET_SSE_MATH > > > - && optimize > > > - && optimize_function_for_speed_p (cfun)); > > > -} > > > - > > > namespace { > > > > > > const pass_data pass_data_remove_partial_avx_dependency = > > > @@ -2419,7 +2330,11 @@ public: > > > /* opt_pass methods: */ > > > virtual bool gate (function *) > > > { > > > - return remove_partial_avx_dependency_gate (); > > > + return (TARGET_AVX > > > + && TARGET_SSE_PARTIAL_REG_DEPENDENCY > > > + && TARGET_SSE_MATH > > > + && optimize > > > + && optimize_function_for_speed_p (cfun)); > > > } > > > > > > virtual unsigned int execute (function *) > > > @@ -2436,68 +2351,6 @@ make_pass_remove_partial_avx_dependency > > > (gcc::context *ctxt) > > > return new pass_remove_partial_avx_dependency (ctxt); > > > } > > > > > > -/* For const vector having one duplicated value, there's no need to put > > > - whole vector in the constant pool when target supports embedded > > > broadcast. */ > > > -static unsigned int > > > -constant_pool_broadcast (void) > > > -{ > > > - timevar_push (TV_MACH_DEP); > > > - rtx_insn *insn; > > > - > > > - for (insn = get_insns (); insn; insn = NEXT_INSN (insn)) > > > - { > > > - if (INSN_P (insn)) > > > - replace_constant_pool_with_broadcast (insn); > > > - } > > > - timevar_pop (TV_MACH_DEP); > > > - return 0; > > > -} > > > - > > > -namespace { > > > - > > > -const pass_data pass_data_constant_pool_broadcast = > > > -{ > > > - RTL_PASS, /* type */ > > > - "cpb", /* name */ > > > - OPTGROUP_NONE, /* optinfo_flags */ > > > - TV_MACH_DEP, /* tv_id */ > > > - 0, /* properties_required */ > > > - 0, /* properties_provided */ > > > - 0, /* properties_destroyed */ > > > - 0, /* todo_flags_start */ > > > - TODO_df_finish, /* todo_flags_finish */ > > > -}; > > > - > > > -class pass_constant_pool_broadcast : public rtl_opt_pass > > > -{ > > > -public: > > > - pass_constant_pool_broadcast (gcc::context *ctxt) > > > - : rtl_opt_pass (pass_data_constant_pool_broadcast, ctxt) > > > - {} > > > - > > > - /* opt_pass methods: */ > > > - virtual bool gate (function *) > > > - { > > > - /* Return false if rpad pass gate is true. > > > - replace_constant_pool_with_broadcast is called > > > - from both this pass and rpad pass. */ > > > - return (TARGET_AVX512F && !remove_partial_avx_dependency_gate ()); > > > - } > > > - > > > - virtual unsigned int execute (function *) > > > - { > > > - return constant_pool_broadcast (); > > > - } > > > -}; // class pass_cpb > > > - > > > -} // anon namespace > > > - > > > -rtl_opt_pass * > > > -make_pass_constant_pool_broadcast (gcc::context *ctxt) > > > -{ > > > - return new pass_constant_pool_broadcast (ctxt); > > > -} > > > - > > > /* This compares the priority of target features in function DECL1 > > > and DECL2. It returns positive value if DECL1 is higher priority, > > > negative value if DECL2 is higher priority and 0 if they are the > > > diff --git a/gcc/config/i386/i386-passes.def > > > b/gcc/config/i386/i386-passes.def > > > index 44df00e94ac..29baf8acd0b 100644 > > > --- a/gcc/config/i386/i386-passes.def > > > +++ b/gcc/config/i386/i386-passes.def > > > @@ -33,4 +33,3 @@ along with GCC; see the file COPYING3. If not see > > > INSERT_PASS_BEFORE (pass_shorten_branches, 1, > > > pass_insert_endbr_and_patchable_area); > > > > > > INSERT_PASS_AFTER (pass_combine, 1, > > > pass_remove_partial_avx_dependency); > > > - INSERT_PASS_AFTER (pass_combine, 1, pass_constant_pool_broadcast); > > > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h > > > index 51376fcc454..07ac02aff69 100644 > > > --- a/gcc/config/i386/i386-protos.h > > > +++ b/gcc/config/i386/i386-protos.h > > > @@ -395,4 +395,3 @@ extern rtl_opt_pass > > > *make_pass_insert_endbr_and_patchable_area > > > (gcc::context *); > > > extern rtl_opt_pass *make_pass_remove_partial_avx_dependency > > > (gcc::context *); > > > -extern rtl_opt_pass *make_pass_constant_pool_broadcast (gcc::context *); > > > diff --git a/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c > > > b/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c > > > index 4deff93c1e8..b0d3dc38a0c 100644 > > > --- a/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c > > > +++ b/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c > > > @@ -6,7 +6,7 @@ typedef double v2df __attribute__((vector_size (16))); > > > static v2df __attribute__((noinline)) > > > bar (v2df a) > > > { > > > - return a + (v2df){ 3.0, 3.0 }; > > > + return a + (v2df){ 3.0, 4.0 }; > > > } > > > > > > v2df __attribute__((noinline)) > > > -- > > > 2.18.1 > > > > > > > > > -- > > BR, > > Hongtao > > > > -- > H.J. -- BR, Hongtao
From c6cf0603b262bc41ec06f29f647214daf2a5965c Mon Sep 17 00:00:00 2001 From: liuhongt <hongtao....@intel.com> Date: Tue, 13 Jul 2021 18:22:03 +0800 Subject: [PATCH] Remove pass_cpb which is related to enable avx512 embedded broadcast from constant pool. By optimizing vector movement to broadcast in ix86_expand_vector_move during pass_expand, pass_reload/LRA can automatically generate an avx512 embedded broadcast, pass_cpb is not needed. Considering that in the absence of avx512f, broadcast from memory is still slightly faster than loading the entire memory, so always enable broadcast. benchmark: https://gitlab.com/x86-benchmarks/microbenchmark/-/tree/vaddps/broadcast The performance diff strategy : cycles memory : 1046611188 memory : 1255420817 memory : 1044720793 memory : 1253414145 average : 1097868397 broadcast : 1044430688 broadcast : 1044477630 broadcast : 1253554603 broadcast : 1044561934 average : 1096756213 But however broadcast has larger size. the size diff size broadcast.o text data bss dec hex filename 137 0 0 137 89 broadcast.o size memory.o text data bss dec hex filename 115 0 0 115 73 memory.o gcc/ChangeLog: * config/i386/i386-expand.c (ix86_broadcast_from_integer_constant): Rename to .. (ix86_broadcast_from_constant): .. this, and extend it to handle float mode. (ix86_expand_vector_move): Extend to float mode. * config/i386/i386-features.c (replace_constant_pool_with_broadcast): Remove. (remove_partial_avx_dependency_gate): Ditto. (constant_pool_broadcast): Ditto. (class pass_constant_pool_broadcast): Ditto. (make_pass_constant_pool_broadcast): Ditto. (remove_partial_avx_dependency): Adjust gate. * config/i386/i386-passes.def: Remove pass_constant_pool_broadcast. * config/i386/i386-protos.h (make_pass_constant_pool_broadcast): Remove. gcc/testsuite/ChangeLog: * gcc.target/i386/fuse-caller-save-xmm.c: Adjust testcase. --- gcc/config/i386/i386-expand.c | 36 +++- gcc/config/i386/i386-features.c | 157 +----------------- gcc/config/i386/i386-passes.def | 1 - gcc/config/i386/i386-protos.h | 1 - .../gcc.target/i386/fuse-caller-save-xmm.c | 2 +- 5 files changed, 34 insertions(+), 163 deletions(-) diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 69ea79e6123..896bd685b1f 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -453,8 +453,10 @@ ix86_expand_move (machine_mode mode, rtx operands[]) emit_insn (gen_rtx_SET (op0, op1)); } +/* OP is a memref of CONST_VECTOR, return scalar constant mem + if CONST_VECTOR is a vec_duplicate, else return NULL. */ static rtx -ix86_broadcast_from_integer_constant (machine_mode mode, rtx op) +ix86_broadcast_from_constant (machine_mode mode, rtx op) { int nunits = GET_MODE_NUNITS (mode); if (nunits < 2) @@ -462,7 +464,8 @@ ix86_broadcast_from_integer_constant (machine_mode mode, rtx op) /* Don't use integer vector broadcast if we can't move from GPR to SSE register directly. */ - if (!TARGET_INTER_UNIT_MOVES_TO_VEC) + if (!TARGET_INTER_UNIT_MOVES_TO_VEC + && INTEGRAL_MODE_P (mode)) return nullptr; /* Convert CONST_VECTOR to a non-standard SSE constant integer @@ -470,12 +473,17 @@ ix86_broadcast_from_integer_constant (machine_mode mode, rtx op) if (!(TARGET_AVX2 || (TARGET_AVX && (GET_MODE_INNER (mode) == SImode - || GET_MODE_INNER (mode) == DImode))) + || GET_MODE_INNER (mode) == DImode)) + || FLOAT_MODE_P (mode)) || standard_sse_constant_p (op, mode)) return nullptr; - /* Don't broadcast from a 64-bit integer constant in 32-bit mode. */ - if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT) + /* Don't broadcast from a 64-bit integer constant in 32-bit mode. + We can still put 64-bit integer constant in memory when + avx512 embed broadcast is available. */ + if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT + && (!TARGET_AVX512F + || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL))) return nullptr; if (GET_MODE_INNER (mode) == TImode) @@ -561,17 +569,29 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[]) if (can_create_pseudo_p () && GET_MODE_SIZE (mode) >= 16 - && GET_MODE_CLASS (mode) == MODE_VECTOR_INT + && VECTOR_MODE_P (mode) && (MEM_P (op1) && SYMBOL_REF_P (XEXP (op1, 0)) && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0)))) { - rtx first = ix86_broadcast_from_integer_constant (mode, op1); + rtx first = ix86_broadcast_from_constant (mode, op1); if (first != nullptr) { /* Broadcast to XMM/YMM/ZMM register from an integer - constant. */ + constant or scalar mem. */ + /* Hard registers are used for 2 purposes: + 1. Prevent stack realignment when the original code + doesn't use vector registers, which is the same for + memcpy and memset. + 2. Prevent combine to convert constant broadcast to + load from constant pool. */ op1 = ix86_gen_scratch_sse_rtx (mode); + if (FLOAT_MODE_P (mode) + || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode)) + { + first = force_const_mem (GET_MODE_INNER (mode), first); + op1 = gen_reg_rtx (mode); + } bool ok = ix86_expand_vector_init_duplicate (false, mode, op1, first); gcc_assert (ok); diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c index cbd430a2ecf..d9c6652d1c9 100644 --- a/gcc/config/i386/i386-features.c +++ b/gcc/config/i386/i386-features.c @@ -2136,81 +2136,6 @@ make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt) return new pass_insert_endbr_and_patchable_area (ctxt); } -/* Replace all one-value const vector that are referenced by SYMBOL_REFs in x - with embedded broadcast. i.e.transform - - vpaddq .LC0(%rip), %zmm0, %zmm0 - ret - .LC0: - .quad 3 - .quad 3 - .quad 3 - .quad 3 - .quad 3 - .quad 3 - .quad 3 - .quad 3 - - to - - vpaddq .LC0(%rip){1to8}, %zmm0, %zmm0 - ret - .LC0: - .quad 3 */ -static void -replace_constant_pool_with_broadcast (rtx_insn *insn) -{ - subrtx_ptr_iterator::array_type array; - FOR_EACH_SUBRTX_PTR (iter, array, &PATTERN (insn), ALL) - { - rtx *loc = *iter; - rtx x = *loc; - rtx broadcast_mem, vec_dup, constant, first; - machine_mode mode; - - /* Constant pool. */ - if (!MEM_P (x) - || !SYMBOL_REF_P (XEXP (x, 0)) - || !CONSTANT_POOL_ADDRESS_P (XEXP (x, 0))) - continue; - - /* Const vector. */ - mode = GET_MODE (x); - if (!VECTOR_MODE_P (mode)) - return; - constant = get_pool_constant (XEXP (x, 0)); - if (GET_CODE (constant) != CONST_VECTOR) - return; - - /* There could be some rtx like - (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1"))) - but with "*.LC1" refer to V2DI constant vector. */ - if (GET_MODE (constant) != mode) - { - constant = simplify_subreg (mode, constant, GET_MODE (constant), 0); - if (constant == NULL_RTX || GET_CODE (constant) != CONST_VECTOR) - return; - } - first = XVECEXP (constant, 0, 0); - - for (int i = 1; i < GET_MODE_NUNITS (mode); ++i) - { - rtx tmp = XVECEXP (constant, 0, i); - /* Vector duplicate value. */ - if (!rtx_equal_p (tmp, first)) - return; - } - - /* Replace with embedded broadcast. */ - broadcast_mem = force_const_mem (GET_MODE_INNER (mode), first); - vec_dup = gen_rtx_VEC_DUPLICATE (mode, broadcast_mem); - validate_change (insn, loc, vec_dup, 0); - - /* At most 1 memory_operand in an insn. */ - return; - } -} - /* At entry of the nearest common dominator for basic blocks with conversions, generate a single vxorps %xmmN, %xmmN, %xmmN @@ -2249,10 +2174,6 @@ remove_partial_avx_dependency (void) if (!NONDEBUG_INSN_P (insn)) continue; - /* Handle AVX512 embedded broadcast here to save compile time. */ - if (TARGET_AVX512F) - replace_constant_pool_with_broadcast (insn); - set = single_set (insn); if (!set) continue; @@ -2384,16 +2305,6 @@ remove_partial_avx_dependency (void) return 0; } -static bool -remove_partial_avx_dependency_gate () -{ - return (TARGET_AVX - && TARGET_SSE_PARTIAL_REG_DEPENDENCY - && TARGET_SSE_MATH - && optimize - && optimize_function_for_speed_p (cfun)); -} - namespace { const pass_data pass_data_remove_partial_avx_dependency = @@ -2419,7 +2330,11 @@ public: /* opt_pass methods: */ virtual bool gate (function *) { - return remove_partial_avx_dependency_gate (); + return (TARGET_AVX + && TARGET_SSE_PARTIAL_REG_DEPENDENCY + && TARGET_SSE_MATH + && optimize + && optimize_function_for_speed_p (cfun)); } virtual unsigned int execute (function *) @@ -2436,68 +2351,6 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt) return new pass_remove_partial_avx_dependency (ctxt); } -/* For const vector having one duplicated value, there's no need to put - whole vector in the constant pool when target supports embedded broadcast. */ -static unsigned int -constant_pool_broadcast (void) -{ - timevar_push (TV_MACH_DEP); - rtx_insn *insn; - - for (insn = get_insns (); insn; insn = NEXT_INSN (insn)) - { - if (INSN_P (insn)) - replace_constant_pool_with_broadcast (insn); - } - timevar_pop (TV_MACH_DEP); - return 0; -} - -namespace { - -const pass_data pass_data_constant_pool_broadcast = -{ - RTL_PASS, /* type */ - "cpb", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ - TV_MACH_DEP, /* tv_id */ - 0, /* properties_required */ - 0, /* properties_provided */ - 0, /* properties_destroyed */ - 0, /* todo_flags_start */ - TODO_df_finish, /* todo_flags_finish */ -}; - -class pass_constant_pool_broadcast : public rtl_opt_pass -{ -public: - pass_constant_pool_broadcast (gcc::context *ctxt) - : rtl_opt_pass (pass_data_constant_pool_broadcast, ctxt) - {} - - /* opt_pass methods: */ - virtual bool gate (function *) - { - /* Return false if rpad pass gate is true. - replace_constant_pool_with_broadcast is called - from both this pass and rpad pass. */ - return (TARGET_AVX512F && !remove_partial_avx_dependency_gate ()); - } - - virtual unsigned int execute (function *) - { - return constant_pool_broadcast (); - } -}; // class pass_cpb - -} // anon namespace - -rtl_opt_pass * -make_pass_constant_pool_broadcast (gcc::context *ctxt) -{ - return new pass_constant_pool_broadcast (ctxt); -} - /* This compares the priority of target features in function DECL1 and DECL2. It returns positive value if DECL1 is higher priority, negative value if DECL2 is higher priority and 0 if they are the diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def index 44df00e94ac..29baf8acd0b 100644 --- a/gcc/config/i386/i386-passes.def +++ b/gcc/config/i386/i386-passes.def @@ -33,4 +33,3 @@ along with GCC; see the file COPYING3. If not see INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbr_and_patchable_area); INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency); - INSERT_PASS_AFTER (pass_combine, 1, pass_constant_pool_broadcast); diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 51376fcc454..07ac02aff69 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -395,4 +395,3 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area (gcc::context *); extern rtl_opt_pass *make_pass_remove_partial_avx_dependency (gcc::context *); -extern rtl_opt_pass *make_pass_constant_pool_broadcast (gcc::context *); diff --git a/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c b/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c index 4deff93c1e8..b0d3dc38a0c 100644 --- a/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c +++ b/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c @@ -6,7 +6,7 @@ typedef double v2df __attribute__((vector_size (16))); static v2df __attribute__((noinline)) bar (v2df a) { - return a + (v2df){ 3.0, 3.0 }; + return a + (v2df){ 3.0, 4.0 }; } v2df __attribute__((noinline)) -- 2.18.1