On Mon, Aug 23, 2021 at 03:23:26PM +0800, Hongtao Liu wrote: > On Sun, Aug 22, 2021 at 8:54 PM H.J. Lu via Gcc-patches > <gcc-patches@gcc.gnu.org> wrote: > > > > In vetor move pattern, replace nonimmediate_or_sse_const_operand with > > nonimmediate_or_sse_const_vector_operand to allow vector load from > > non-uniform CONST_VECTOR. Non-uniform CONST_VECTOR is enabled only in > > the combine pass since other RTL optimizers work better with constant > > pool. > > > > gcc/ > > > > PR target/43147 > > * config/i386/i386-expand.c (ix86_constant_broadcast): New > > function. Extracted from ix86_broadcast_from_constant. > > (ix86_broadcast_from_constant): Call ix86_constant_broadcast. > > (non_uniform_const_vector_p): New function. > > * config/i386/i386-protos.h (non_uniform_const_vector_p): New > > prototype. > > * config/i386/predicates.md > > (nonimmediate_or_sse_const_vector_operand): New predicate. > > * config/i386/sse.md (mov<mode>_internal): Replace > > nonimmediate_or_sse_const_operand with > > nonimmediate_or_sse_const_vector_operand. > > > > gcc/testsuite/ > > > > PR target/43147 > > * gcc.target/i386/pr43147.c: New test. > > --- > > gcc/config/i386/i386-expand.c | 74 ++++++++++++++----------- > > gcc/config/i386/i386-protos.h | 1 + > > gcc/config/i386/predicates.md | 7 +++ > > gcc/config/i386/sse.md | 2 +- > > gcc/testsuite/gcc.target/i386/pr43147.c | 15 +++++ > > 5 files changed, 67 insertions(+), 32 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/i386/pr43147.c > > > > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c > > index 9bf13dbfa92..1d8f3110310 100644 > > --- a/gcc/config/i386/i386-expand.c > > +++ b/gcc/config/i386/i386-expand.c > > @@ -453,31 +453,12 @@ ix86_expand_move (machine_mode mode, rtx operands[]) > > emit_insn (gen_rtx_SET (op0, op1)); > > } > > > > -/* OP is a memref of CONST_VECTOR, return scalar constant mem > > - if CONST_VECTOR is a vec_duplicate, else return NULL. */ > > +/* CONSTANT is a CONST_VECTOR, return scalar constant if CONST_VECTOR is > > + a vec_duplicate, else return nullptr. */ > > + > > static rtx > > -ix86_broadcast_from_constant (machine_mode mode, rtx op) > > +ix86_constant_broadcast (rtx constant, machine_mode mode) > > { > > - int nunits = GET_MODE_NUNITS (mode); > > - if (nunits < 2) > > - return nullptr; > > - > > - /* Don't use integer vector broadcast if we can't move from GPR to SSE > > - register directly. */ > > - if (!TARGET_INTER_UNIT_MOVES_TO_VEC > > - && INTEGRAL_MODE_P (mode)) > > - return nullptr; > > - > > - /* Convert CONST_VECTOR to a non-standard SSE constant integer > > - broadcast only if vector broadcast is available. */ > > - if (!(TARGET_AVX2 > > - || (TARGET_AVX > > - && (GET_MODE_INNER (mode) == SImode > > - || GET_MODE_INNER (mode) == DImode)) > > - || FLOAT_MODE_P (mode)) > > - || standard_sse_constant_p (op, mode)) > > - return nullptr; > > - > This part is dropped in the new ix86_broadcast_from_constant,why?
Restored. > > /* Don't broadcast from a 64-bit integer constant in 32-bit mode. > > We can still put 64-bit integer constant in memory when > > avx512 embed broadcast is available. */ > > @@ -486,13 +467,6 @@ ix86_broadcast_from_constant (machine_mode mode, rtx > > op) > > || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL))) > > return nullptr; > > > > - if (GET_MODE_INNER (mode) == TImode) > > - return nullptr; > > - > > - rtx constant = get_pool_constant (XEXP (op, 0)); > > - if (GET_CODE (constant) != CONST_VECTOR) > > - return nullptr; > > - > > /* There could be some rtx like > > (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1"))) > > but with "*.LC1" refer to V2DI constant vector. */ > > @@ -506,7 +480,7 @@ ix86_broadcast_from_constant (machine_mode mode, rtx op) > > > > rtx first = XVECEXP (constant, 0, 0); > > > > - for (int i = 1; i < nunits; ++i) > > + for (int i = 1; i < GET_MODE_NUNITS (mode); ++i) > > { > > rtx tmp = XVECEXP (constant, 0, i); > > /* Vector duplicate value. */ > > @@ -517,6 +491,44 @@ ix86_broadcast_from_constant (machine_mode mode, rtx > > op) > > return first; > > } > > > > +/* OP is a memref of CONST_VECTOR, return scalar constant mem > > + if CONST_VECTOR is a vec_duplicate, else return NULL. */ > > +static rtx > > +ix86_broadcast_from_constant (machine_mode mode, rtx op) > > +{ > > + int nunits = GET_MODE_NUNITS (mode); > > + if (nunits < 2) > > + return nullptr; > > + > > + /* Don't use integer vector broadcast if we can't move from GPR to SSE > > + register directly. */ > > + if (!TARGET_INTER_UNIT_MOVES_TO_VEC > > + && INTEGRAL_MODE_P (mode)) > > + return nullptr; > > + > > + if (GET_MODE_INNER (mode) == TImode) > > + return nullptr; > > + > > + rtx constant = get_pool_constant (XEXP (op, 0)); > > + if (GET_CODE (constant) != CONST_VECTOR) > > + return nullptr; > > + > > + return ix86_constant_broadcast (constant, mode); > > +} > > + > > +/* Return true if OP is a non-uniform CONST_VECTOR. */ > > + > Drop empty line. Removed. > > +bool > > +non_uniform_const_vector_p (rtx op, machine_mode mode) > > +{ > > + /* Allow non-uniform CONST_VECTOR only in the combine pass since other > > + RTL optimizers work better with constant pool. */ > > + return (current_pass > > + && current_pass->tv_id == TV_COMBINE > > + && GET_CODE (op) == CONST_VECTOR > > + && ix86_constant_broadcast (op, mode) == nullptr); > > +} > I'm not sure it's a good idea to add a specific pass condition to the > backend's predicate. > If it's for combine, should we add define_split alternatively? I tried. RTL optimizers won't optimize non-uniform CONST_VECTOR which leads extra moves: FAIL: gcc.target/i386/fnabs.c scan-assembler-times \tv?orp[sd][ \t] 2 FAIL: gcc.target/i386/fuse-caller-save-xmm.c scan-assembler-times addpd\\t\\.?[Ll]C0.*, %xmm0 1 FAIL: gcc.target/i386/fuse-caller-save-xmm.c scan-assembler-times movapd\t%xmm0, %xmm1 1 Only all bits 0/1 CONST_VECTOR is allowed as an operand for load and all other CONST_VECTORs are placed in constant pool. My patch allows non-uniform CONST_VECTOR in the combine pass so that it can generate load from non-uniform CONST_VECTOR. Since non-uniform CONST_VECTOR isn't allowed in other RTL passes, it will be put in constant pool. H.J. --- In vetor move pattern, replace nonimmediate_or_sse_const_operand with nonimmediate_or_sse_const_vector_operand to allow vector load from non-uniform CONST_VECTOR. Non-uniform CONST_VECTOR is enabled only in the combine pass since other RTL optimizers and IRA/LRA work better with constant pool. gcc/ PR target/43147 * config/i386/i386-expand.c (ix86_constant_broadcast): New function. Extracted from ix86_broadcast_from_constant. (ix86_broadcast_from_constant): Call ix86_constant_broadcast. (non_uniform_const_vector_p): New function. * config/i386/i386-protos.h (non_uniform_const_vector_p): New prototype. * config/i386/predicates.md (nonimmediate_or_sse_const_vector_operand): New predicate. * config/i386/sse.md (mov<mode>_internal): Replace nonimmediate_or_sse_const_operand with nonimmediate_or_sse_const_vector_operand. gcc/testsuite/ PR target/43147 * gcc.target/i386/pr43147.c: New test. --- gcc/config/i386/i386-expand.c | 79 ++++++++++++++++--------- gcc/config/i386/i386-protos.h | 1 + gcc/config/i386/predicates.md | 7 +++ gcc/config/i386/sse.md | 2 +- gcc/testsuite/gcc.target/i386/pr43147.c | 15 +++++ 5 files changed, 74 insertions(+), 30 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr43147.c diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 9bf13dbfa92..599f66582d5 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -453,6 +453,44 @@ ix86_expand_move (machine_mode mode, rtx operands[]) emit_insn (gen_rtx_SET (op0, op1)); } +/* CONSTANT is a CONST_VECTOR, return scalar constant if CONST_VECTOR is + a vec_duplicate, else return nullptr. */ + +static rtx +ix86_constant_broadcast (rtx constant, machine_mode mode) +{ + /* Don't broadcast from a 64-bit integer constant in 32-bit mode. + We can still put 64-bit integer constant in memory when + avx512 embed broadcast is available. */ + if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT + && (!TARGET_AVX512F + || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL))) + return nullptr; + + /* There could be some rtx like + (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1"))) + but with "*.LC1" refer to V2DI constant vector. */ + if (GET_MODE (constant) != mode) + { + constant = simplify_subreg (mode, constant, GET_MODE (constant), + 0); + if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR) + return nullptr; + } + + rtx first = XVECEXP (constant, 0, 0); + + for (int i = 1; i < GET_MODE_NUNITS (mode); ++i) + { + rtx tmp = XVECEXP (constant, 0, i); + /* Vector duplicate value. */ + if (!rtx_equal_p (tmp, first)) + return nullptr; + } + + return first; +} + /* OP is a memref of CONST_VECTOR, return scalar constant mem if CONST_VECTOR is a vec_duplicate, else return NULL. */ static rtx @@ -478,14 +516,6 @@ ix86_broadcast_from_constant (machine_mode mode, rtx op) || standard_sse_constant_p (op, mode)) return nullptr; - /* Don't broadcast from a 64-bit integer constant in 32-bit mode. - We can still put 64-bit integer constant in memory when - avx512 embed broadcast is available. */ - if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT - && (!TARGET_AVX512F - || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL))) - return nullptr; - if (GET_MODE_INNER (mode) == TImode) return nullptr; @@ -493,28 +523,19 @@ ix86_broadcast_from_constant (machine_mode mode, rtx op) if (GET_CODE (constant) != CONST_VECTOR) return nullptr; - /* There could be some rtx like - (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1"))) - but with "*.LC1" refer to V2DI constant vector. */ - if (GET_MODE (constant) != mode) - { - constant = simplify_subreg (mode, constant, GET_MODE (constant), - 0); - if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR) - return nullptr; - } - - rtx first = XVECEXP (constant, 0, 0); - - for (int i = 1; i < nunits; ++i) - { - rtx tmp = XVECEXP (constant, 0, i); - /* Vector duplicate value. */ - if (!rtx_equal_p (tmp, first)) - return nullptr; - } + return ix86_constant_broadcast (constant, mode); +} - return first; +/* Return true if OP is a non-uniform CONST_VECTOR. */ +bool +non_uniform_const_vector_p (rtx op, machine_mode mode) +{ + /* Allow non-uniform CONST_VECTOR only in the combine pass since other + RTL optimizers and IRA/LRA work better with constant pool. */ + return (current_pass + && current_pass->tv_id == TV_COMBINE + && GET_CODE (op) == CONST_VECTOR + && ix86_constant_broadcast (op, mode) == nullptr); } void diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 2fd13074c81..91d4c7cefb8 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -58,6 +58,7 @@ extern int standard_80387_constant_p (rtx); extern const char *standard_80387_constant_opcode (rtx); extern rtx standard_80387_constant_rtx (int); extern int standard_sse_constant_p (rtx, machine_mode); +extern bool non_uniform_const_vector_p (rtx, machine_mode); extern const char *standard_sse_constant_opcode (rtx_insn *, rtx *); extern bool ix86_standard_x87sse_constant_load_p (const rtx_insn *, rtx); extern bool ix86_pre_reload_split (void); diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 9321f332ef9..435b22855c8 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1176,6 +1176,13 @@ (define_predicate "nonimmediate_or_sse_const_operand" (ior (match_operand 0 "nonimmediate_operand") (match_test "standard_sse_constant_p (op, mode)"))) +;; Return true when OP is nonimmediate, standard SSE constant, or +;; non-uniform CONST_VECTOR. +(define_predicate "nonimmediate_or_sse_const_vector_operand" + (ior (match_operand 0 "nonimmediate_operand") + (match_test "standard_sse_constant_p (op, mode)") + (match_test "non_uniform_const_vector_p (op, mode)"))) + ;; Return true if OP is a register or a zero. (define_predicate "reg_or_0_operand" (ior (match_operand 0 "register_operand") diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 95f95823ea3..a7919eca6c3 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -1075,7 +1075,7 @@ (define_expand "mov<mode>" (define_insn "mov<mode>_internal" [(set (match_operand:VMOVE 0 "nonimmediate_operand" "=v,v ,v ,m") - (match_operand:VMOVE 1 "nonimmediate_or_sse_const_operand" + (match_operand:VMOVE 1 "nonimmediate_or_sse_const_vector_operand" " C,<sseconstm1>,vm,v"))] "TARGET_SSE && (register_operand (operands[0], <MODE>mode) diff --git a/gcc/testsuite/gcc.target/i386/pr43147.c b/gcc/testsuite/gcc.target/i386/pr43147.c new file mode 100644 index 00000000000..3c30f917c06 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr43147.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse2" } */ +/* { dg-final { scan-assembler "movaps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +#include <x86intrin.h> + +__m128 +foo (void) +{ + __m128 m = _mm_set_ps(1.0f, 2.0f, 3.0f, 4.0f); + m = _mm_shuffle_ps(m, m, 0xC9); + m = _mm_shuffle_ps(m, m, 0x2D); + return m; +} -- 2.31.1