For a vector init constructor such as: --- typedef float __v4sf __attribute__ ((__vector_size__ (16)));
__v4sf foo (__v4sf x, float f) { __v4sf y = { f, x[1], x[2], x[3] }; return y; } --- we can optimize the vector init constructor into a vector copy or permute followed by a single scalar insert: __v4sf D.1912; __v4sf D.1913; __v4sf D.1914; __v4sf y; x.0_1 = x; D.1912 = x.0_1; _2 = D.1912; D.1913 = _2; BIT_FIELD_REF <D.1913, 32, 0> = f; y = D.1913; D.1914 = y; return D.1914; instead of: __v4sf D.1962; __v4sf y; _1 = BIT_FIELD_REF <x, 32, 32>; _2 = BIT_FIELD_REF <x, 32, 64>; _3 = BIT_FIELD_REF <x, 32, 96>; y = {f, _1, _2, _3}; D.1962 = y; return D.1962; gcc/ PR tree-optimization/88828 * gimplify.c (gimplify_init_constructor): Optimize vector init constructor with vector copy or permute followed by a single scalar insert. gcc/testsuite/ PR tree-optimization/88828 * gcc.target/i386/pr88828-1.c: New test. * gcc.target/i386/pr88828-2.c: Likewise. * gcc.target/i386/pr88828-3a.c: Likewise. * gcc.target/i386/pr88828-3b.c: Likewise. * gcc.target/i386/pr88828-4a.c: Likewise. * gcc.target/i386/pr88828-4b.c: Likewise. * gcc.target/i386/pr88828-5a.c: Likewise. * gcc.target/i386/pr88828-5b.c: Likewise. * gcc.target/i386/pr88828-6a.c: Likewise. * gcc.target/i386/pr88828-6b.c: Likewise. 
--- gcc/gimplify.c | 176 +++++++++++++++++++-- gcc/testsuite/gcc.target/i386/pr88828-1.c | 16 ++ gcc/testsuite/gcc.target/i386/pr88828-2.c | 17 ++ gcc/testsuite/gcc.target/i386/pr88828-3a.c | 16 ++ gcc/testsuite/gcc.target/i386/pr88828-3b.c | 18 +++ gcc/testsuite/gcc.target/i386/pr88828-4a.c | 17 ++ gcc/testsuite/gcc.target/i386/pr88828-4b.c | 20 +++ gcc/testsuite/gcc.target/i386/pr88828-5a.c | 16 ++ gcc/testsuite/gcc.target/i386/pr88828-5b.c | 18 +++ gcc/testsuite/gcc.target/i386/pr88828-6a.c | 17 ++ gcc/testsuite/gcc.target/i386/pr88828-6b.c | 19 +++ 11 files changed, 336 insertions(+), 14 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3b.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4b.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5b.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6b.c diff --git a/gcc/gimplify.c b/gcc/gimplify.c index 983635ba21f..893a4311f9e 100644 --- a/gcc/gimplify.c +++ b/gcc/gimplify.c @@ -5082,22 +5082,170 @@ gimplify_init_constructor (tree *expr_p, gimple_seq *pre_p, gimple_seq *post_p, TREE_CONSTANT (ctor) = 0; } - /* Vector types use CONSTRUCTOR all the way through gimple - compilation as a general initializer. */ - FOR_EACH_VEC_SAFE_ELT (elts, ix, ce) + tree rhs_vector = NULL; + /* The vector element to replace scalar elements, which + will be overridden by scalar insert. */ + tree vector_element = NULL; + /* The single scalar element. 
*/ + tree scalar_element = NULL; + unsigned int scalar_idx = 0; + enum { unknown, copy, permute, init } operation = unknown; + bool insert = false; + + /* Check if we can generate vector copy or permute followed by + a single scalar insert. */ + if (TYPE_VECTOR_SUBPARTS (type).is_constant ()) { - enum gimplify_status tret; - tret = gimplify_expr (&ce->value, pre_p, post_p, is_gimple_val, - fb_rvalue); - if (tret == GS_ERROR) - ret = GS_ERROR; - else if (TREE_STATIC (ctor) - && !initializer_constant_valid_p (ce->value, - TREE_TYPE (ce->value))) - TREE_STATIC (ctor) = 0; + /* If all RHS vector elements come from the same vector, + we can use permute. If all RHS vector elements come + from the same vector in the same order, we can use + copy. */ + unsigned int nunits + = TYPE_VECTOR_SUBPARTS (type).to_constant (); + unsigned int nscalars = 0; + unsigned int nvectors = 0; + operation = unknown; + FOR_EACH_VEC_SAFE_ELT (elts, ix, ce) + if (TREE_CODE (ce->value) == ARRAY_REF + || TREE_CODE (ce->value) == ARRAY_RANGE_REF) + { + if (!vector_element) + vector_element = ce->value; + /* Get the vector index. */ + tree idx = TREE_OPERAND (ce->value, 1); + if (TREE_CODE (idx) == INTEGER_CST) + { + /* Get the RHS vector. */ + tree r = ce->value; + while (handled_component_p (r)) + r = TREE_OPERAND (r, 0); + if (type == TREE_TYPE (r)) + { + /* The RHS vector has the same type as + LHS. */ + if (rhs_vector == NULL) + rhs_vector = r; + + /* Check if all RHS vector elements come + fome the same vector. */ + if (rhs_vector == r) + { + nvectors++; + if (TREE_INT_CST_LOW (idx) == ix + && (operation == unknown + || operation == copy)) + operation = copy; + else + operation = permute; + continue; + } + } + } + + /* Otherwise, use vector init. */ + break; + } + else if (TREE_CODE (TYPE_SIZE (TREE_TYPE (ce->value))) + == INTEGER_CST) + { + /* Only allow one single scalar insert. 
*/ + if (nscalars != 0) + break; + nscalars = 1; + insert = true; + scalar_idx = ix; + scalar_element = ce->value; + } + + /* Allow a single scalar insert with vector copy or + vector permute. Vector copy without insert is OK. */ + if (nunits != (nscalars + nvectors) + || (nscalars == 0 && operation != copy)) + operation = unknown; + } + + if (operation == unknown) + { + /* Default to the regular vector init constructor. */ + operation = init; + insert = false; + } + + if (operation == copy) + { + /* Generate a vector copy. */ + tree var = create_tmp_var (type); + if (gimplify_expr (&rhs_vector, pre_p, post_p, + is_gimple_val, fb_rvalue) == GS_ERROR) + { + ret = GS_ERROR; + break; + } + gassign *init = gimple_build_assign (var, rhs_vector); + gimple_seq_add_stmt (pre_p, init); + if (gimplify_expr (&var, pre_p, post_p, is_gimple_val, + fb_rvalue) == GS_ERROR) + { + ret = GS_ERROR; + break; + } + /* Replace RHS with the vector copy. */ + if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0))) + TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (var, pre_p); + else + TREE_OPERAND (*expr_p, 1) = var; + } + else + { + /* Prepare for vector permute by replacing the scalar + element with the vector one. */ + if (operation == permute) + (elts->address())[scalar_idx].value = vector_element; + + /* Vector types use CONSTRUCTOR all the way through gimple + compilation as a general initializer. */ + FOR_EACH_VEC_SAFE_ELT (elts, ix, ce) + { + enum gimplify_status tret; + tret = gimplify_expr (&ce->value, pre_p, post_p, + is_gimple_val, + fb_rvalue); + if (tret == GS_ERROR) + ret = GS_ERROR; + else if (TREE_STATIC (ctor) + && !initializer_constant_valid_p (ce->value, + TREE_TYPE (ce->value))) + TREE_STATIC (ctor) = 0; + } + if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0))) + TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (ctor, pre_p); + } + + if (insert) + { + /* Generate a single scalar insert after vector copy or + permute. 
*/ + tree rhs = TREE_OPERAND (*expr_p, 1); + tree var = create_tmp_var (type); + gassign *init = gimple_build_assign (var, rhs); + gimple_seq_add_stmt (pre_p, init); + if (gimplify_expr (&scalar_element, pre_p, post_p, + is_gimple_val, fb_rvalue) == GS_ERROR) + { + ret = GS_ERROR; + break; + } + tree scalar_type = TREE_TYPE (scalar_element); + tree scalar_size = TYPE_SIZE (scalar_type); + tree bitpos = bitsize_int (scalar_idx + * TREE_INT_CST_LOW (scalar_size)); + tree ref = build3_loc (EXPR_LOCATION (rhs), BIT_FIELD_REF, + scalar_type, var, scalar_size, + bitpos); + init = gimple_build_assign (ref, scalar_element); + gimplify_seq_add_stmt (pre_p, init); + TREE_OPERAND (*expr_p, 1) = var; } - if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0))) - TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (ctor, pre_p); } break; diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1.c b/gcc/testsuite/gcc.target/i386/pr88828-1.c new file mode 100644 index 00000000000..4ef1feab389 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-1.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { f, x[1], x[2], x[3] }; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2.c b/gcc/testsuite/gcc.target/i386/pr88828-2.c new file mode 100644 index 00000000000..6dc482b6f4b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-2.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { 
dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = x; + y[0] = f; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3a.c b/gcc/testsuite/gcc.target/i386/pr88828-3a.c new file mode 100644 index 00000000000..97eb8e7162a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-3a.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-times "shufps" 1 } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { f, x[0], x[2], x[3] }; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3b.c b/gcc/testsuite/gcc.target/i386/pr88828-3b.c new file mode 100644 index 00000000000..ab2ba730716 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-3b.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ +/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vmovaps" } } */ +/* { dg-final { scan-assembler-not "vmovlhps" } } */ +/* { dg-final { scan-assembler-not "vunpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { f, x[0], x[2], x[3] }; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4a.c b/gcc/testsuite/gcc.target/i386/pr88828-4a.c new file mode 100644 index 00000000000..a54689be701 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-4a.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-times "shufps" 1 } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], x[1] }; + y[0] = f; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4b.c b/gcc/testsuite/gcc.target/i386/pr88828-4b.c new file mode 100644 index 00000000000..0c3a1024d93 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-4b.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ +/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vshufps" } } */ +/* { dg-final { scan-assembler-not "vmovaps" } } */ +/* { dg-final { scan-assembler-not "vmovlhps" } } */ +/* { dg-final { scan-assembler-not "vunpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], x[1] }; + y[0] = f; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5a.c b/gcc/testsuite/gcc.target/i386/pr88828-5a.c new file mode 100644 index 00000000000..534808d3cd1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-5a.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-times "shufps" 2 } } */ +/* { dg-final { scan-assembler-times "movaps" 1 } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], f }; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5b.c b/gcc/testsuite/gcc.target/i386/pr88828-5b.c new file mode 100644 index 00000000000..aebea790979 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-5b.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ +/* { dg-final { scan-assembler-times "vinsertps" 1 } } */ +/* { dg-final { scan-assembler-not "vmovss" } } */ +/* { dg-final { scan-assembler-not "vshufps" } } */ +/* { dg-final { scan-assembler-not "vmovaps" } } */ +/* { dg-final { scan-assembler-not "vmovlhps" } } */ +/* { dg-final { scan-assembler-not "vunpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ 
(16))); + +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], f }; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6a.c b/gcc/testsuite/gcc.target/i386/pr88828-6a.c new file mode 100644 index 00000000000..d43a36d9137 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-6a.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-times "shufps" 2 } } */ +/* { dg-final { scan-assembler-times "movaps" 1 } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], x[0] }; + y[3] = f; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6b.c b/gcc/testsuite/gcc.target/i386/pr88828-6b.c new file mode 100644 index 00000000000..6856fe6500e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-6b.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ +/* { dg-final { scan-assembler-times "vinsertps" 1 } } */ +/* { dg-final { scan-assembler-not "vshufps" } } */ +/* { dg-final { scan-assembler-not "vmovss" } } */ +/* { dg-final { scan-assembler-not "vmovaps" } } */ +/* { dg-final { scan-assembler-not "vmovlhps" } } */ +/* { dg-final { scan-assembler-not "vunpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], x[0] }; + y[3] = f; + return y; +} -- 2.20.1