On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.to...@gmail.com> wrote: > > For vector init constructor: > > --- > typedef float __v4sf __attribute__ ((__vector_size__ (16))); > > __v4sf > foo (__v4sf x, float f) > { > __v4sf y = { f, x[1], x[2], x[3] }; > return y; > } > --- > > we can optimize vector init constructor with vector copy or permute > followed by a single scalar insert: > > __v4sf D.1912; > __v4sf D.1913; > __v4sf D.1914; > __v4sf y; > > x.0_1 = x; > D.1912 = x.0_1; > _2 = D.1912; > D.1913 = _2; > BIT_FIELD_REF <D.1913, 32, 0> = f; > y = D.1913; > D.1914 = y; > return D.1914; > > instead of > > __v4sf D.1962; > __v4sf y; > > _1 = BIT_FIELD_REF <x, 32, 32>; > _2 = BIT_FIELD_REF <x, 32, 64>; > _3 = BIT_FIELD_REF <x, 32, 96>; > y = {f, _1, _2, _3}; > D.1962 = y; > return D.1962; > > gcc/ > > PR tree-optimization/88828 > * gimplify.c (gimplify_init_constructor): Optimize vector init > constructor with vector copy or permute followed by a single > scalar insert.
Doing this here does not catch things like: typedef float __v4sf __attribute__ ((__vector_size__ (16))); __v4sf vector_init (float f0,float f1, float f2,float f3) { __v4sf y = { f0, f1, f2, f3 }; return y; } __v4sf foo (__v4sf x, float f) { return vector_init (f, x[1], x[2], x[3]) ; } > > gcc/testsuite/ > > PR tree-optimization/88828 > * gcc.target/i386/pr88828-1.c: New test. > * gcc.target/i386/pr88828-2.c: Likewise. > * gcc.target/i386/pr88828-3a.c: Likewise. > * gcc.target/i386/pr88828-3b.c: Likewise. > * gcc.target/i386/pr88828-4a.c: Likewise. > * gcc.target/i386/pr88828-4b.c: Likewise. > * gcc.target/i386/pr88828-5a.c: Likewise. > * gcc.target/i386/pr88828-5b.c: Likewise. > * gcc.target/i386/pr88828-6a.c: Likewise. > * gcc.target/i386/pr88828-6b.c: Likewise. > --- > gcc/gimplify.c | 176 +++++++++++++++++++-- > gcc/testsuite/gcc.target/i386/pr88828-1.c | 16 ++ > gcc/testsuite/gcc.target/i386/pr88828-2.c | 17 ++ > gcc/testsuite/gcc.target/i386/pr88828-3a.c | 16 ++ > gcc/testsuite/gcc.target/i386/pr88828-3b.c | 18 +++ > gcc/testsuite/gcc.target/i386/pr88828-4a.c | 17 ++ > gcc/testsuite/gcc.target/i386/pr88828-4b.c | 20 +++ > gcc/testsuite/gcc.target/i386/pr88828-5a.c | 16 ++ > gcc/testsuite/gcc.target/i386/pr88828-5b.c | 18 +++ > gcc/testsuite/gcc.target/i386/pr88828-6a.c | 17 ++ > gcc/testsuite/gcc.target/i386/pr88828-6b.c | 19 +++ > 11 files changed, 336 insertions(+), 14 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3a.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3b.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4a.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4b.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5a.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5b.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6a.c > create 
mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6b.c > > diff --git a/gcc/gimplify.c b/gcc/gimplify.c > index 983635ba21f..893a4311f9e 100644 > --- a/gcc/gimplify.c > +++ b/gcc/gimplify.c > @@ -5082,22 +5082,170 @@ gimplify_init_constructor (tree *expr_p, gimple_seq > *pre_p, gimple_seq *post_p, > TREE_CONSTANT (ctor) = 0; > } > > - /* Vector types use CONSTRUCTOR all the way through gimple > - compilation as a general initializer. */ > - FOR_EACH_VEC_SAFE_ELT (elts, ix, ce) > + tree rhs_vector = NULL; > + /* The vector element to replace scalar elements, which > + will be overridden by scalar insert. */ > + tree vector_element = NULL; > + /* The single scalar element. */ > + tree scalar_element = NULL; > + unsigned int scalar_idx = 0; > + enum { unknown, copy, permute, init } operation = unknown; > + bool insert = false; > + > + /* Check if we can generate vector copy or permute followed by > + a single scalar insert. */ > + if (TYPE_VECTOR_SUBPARTS (type).is_constant ()) > { > - enum gimplify_status tret; > - tret = gimplify_expr (&ce->value, pre_p, post_p, is_gimple_val, > - fb_rvalue); > - if (tret == GS_ERROR) > - ret = GS_ERROR; > - else if (TREE_STATIC (ctor) > - && !initializer_constant_valid_p (ce->value, > - TREE_TYPE (ce->value))) > - TREE_STATIC (ctor) = 0; > + /* If all RHS vector elements come from the same vector, > + we can use permute. If all RHS vector elements come > + from the same vector in the same order, we can use > + copy. */ > + unsigned int nunits > + = TYPE_VECTOR_SUBPARTS (type).to_constant (); > + unsigned int nscalars = 0; > + unsigned int nvectors = 0; > + operation = unknown; > + FOR_EACH_VEC_SAFE_ELT (elts, ix, ce) > + if (TREE_CODE (ce->value) == ARRAY_REF > + || TREE_CODE (ce->value) == ARRAY_RANGE_REF) > + { > + if (!vector_element) > + vector_element = ce->value; > + /* Get the vector index. */ > + tree idx = TREE_OPERAND (ce->value, 1); > + if (TREE_CODE (idx) == INTEGER_CST) > + { > + /* Get the RHS vector. 
*/ > + tree r = ce->value; > + while (handled_component_p (r)) > + r = TREE_OPERAND (r, 0); > + if (type == TREE_TYPE (r)) > + { > + /* The RHS vector has the same type as > + LHS. */ > + if (rhs_vector == NULL) > + rhs_vector = r; > + > + /* Check if all RHS vector elements come > + from the same vector. */ > + if (rhs_vector == r) > + { > + nvectors++; > + if (TREE_INT_CST_LOW (idx) == ix > + && (operation == unknown > + || operation == copy)) > + operation = copy; > + else > + operation = permute; > + continue; > + } > + } > + } > + > + /* Otherwise, use vector init. */ > + break; > + } > + else if (TREE_CODE (TYPE_SIZE (TREE_TYPE (ce->value))) > + == INTEGER_CST) > + { > + /* Only allow one single scalar insert. */ > + if (nscalars != 0) > + break; > + nscalars = 1; > + insert = true; > + scalar_idx = ix; > + scalar_element = ce->value; > + } > + > + /* Allow a single scalar insert with vector copy or > + vector permute. Vector copy without insert is OK. */ > + if (nunits != (nscalars + nvectors) > + || (nscalars == 0 && operation != copy)) > + operation = unknown; > + } > + > + if (operation == unknown) > + { > + /* Default to the regular vector init constructor. */ > + operation = init; > + insert = false; > + } > + > + if (operation == copy) > + { > + /* Generate a vector copy. */ > + tree var = create_tmp_var (type); > + if (gimplify_expr (&rhs_vector, pre_p, post_p, > + is_gimple_val, fb_rvalue) == GS_ERROR) > + { > + ret = GS_ERROR; > + break; > + } > + gassign *init = gimple_build_assign (var, rhs_vector); > + gimple_seq_add_stmt (pre_p, init); > + if (gimplify_expr (&var, pre_p, post_p, is_gimple_val, > + fb_rvalue) == GS_ERROR) > + { > + ret = GS_ERROR; > + break; > + } > + /* Replace RHS with the vector copy. 
*/ > + if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0))) > + TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (var, pre_p); > + else > + TREE_OPERAND (*expr_p, 1) = var; > + } > + else > + { > + /* Prepare for vector permute by replacing the scalar > + element with the vector one. */ > + if (operation == permute) > + (elts->address())[scalar_idx].value = vector_element; > + > + /* Vector types use CONSTRUCTOR all the way through gimple > + compilation as a general initializer. */ > + FOR_EACH_VEC_SAFE_ELT (elts, ix, ce) > + { > + enum gimplify_status tret; > + tret = gimplify_expr (&ce->value, pre_p, post_p, > + is_gimple_val, > + fb_rvalue); > + if (tret == GS_ERROR) > + ret = GS_ERROR; > + else if (TREE_STATIC (ctor) > + && !initializer_constant_valid_p (ce->value, > + TREE_TYPE > (ce->value))) > + TREE_STATIC (ctor) = 0; > + } > + if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0))) > + TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (ctor, pre_p); > + } > + > + if (insert) > + { > + /* Generate a single scalar insert after vector copy or > + permute. 
*/ > + tree rhs = TREE_OPERAND (*expr_p, 1); > + tree var = create_tmp_var (type); > + gassign *init = gimple_build_assign (var, rhs); > + gimple_seq_add_stmt (pre_p, init); > + if (gimplify_expr (&scalar_element, pre_p, post_p, > + is_gimple_val, fb_rvalue) == GS_ERROR) > + { > + ret = GS_ERROR; > + break; > + } > + tree scalar_type = TREE_TYPE (scalar_element); > + tree scalar_size = TYPE_SIZE (scalar_type); > + tree bitpos = bitsize_int (scalar_idx > + * TREE_INT_CST_LOW (scalar_size)); > + tree ref = build3_loc (EXPR_LOCATION (rhs), BIT_FIELD_REF, > + scalar_type, var, scalar_size, > + bitpos); > + init = gimple_build_assign (ref, scalar_element); > + gimplify_seq_add_stmt (pre_p, init); > + TREE_OPERAND (*expr_p, 1) = var; > } > - if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0))) > - TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (ctor, pre_p); > } > break; > > diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1.c > b/gcc/testsuite/gcc.target/i386/pr88828-1.c > new file mode 100644 > index 00000000000..4ef1feab389 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr88828-1.c > @@ -0,0 +1,16 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse -mno-sse4" } */ > +/* { dg-final { scan-assembler "movss" } } */ > +/* { dg-final { scan-assembler-not "movaps" } } */ > +/* { dg-final { scan-assembler-not "movlhps" } } */ > +/* { dg-final { scan-assembler-not "unpcklps" } } */ > +/* { dg-final { scan-assembler-not "shufps" } } */ > + > +typedef float __v4sf __attribute__ ((__vector_size__ (16))); > + > +__v4sf > +foo (__v4sf x, float f) > +{ > + __v4sf y = { f, x[1], x[2], x[3] }; > + return y; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2.c > b/gcc/testsuite/gcc.target/i386/pr88828-2.c > new file mode 100644 > index 00000000000..6dc482b6f4b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr88828-2.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse -mno-sse4" } */ > +/* { dg-final { scan-assembler "movss" } } */ 
> +/* { dg-final { scan-assembler-not "movaps" } } */ > +/* { dg-final { scan-assembler-not "movlhps" } } */ > +/* { dg-final { scan-assembler-not "unpcklps" } } */ > +/* { dg-final { scan-assembler-not "shufps" } } */ > + > +typedef float __v4sf __attribute__ ((__vector_size__ (16))); > + > +__v4sf > +foo (__v4sf x, float f) > +{ > + __v4sf y = x; > + y[0] = f; > + return y; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3a.c > b/gcc/testsuite/gcc.target/i386/pr88828-3a.c > new file mode 100644 > index 00000000000..97eb8e7162a > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr88828-3a.c > @@ -0,0 +1,16 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse -mno-sse4" } */ > +/* { dg-final { scan-assembler "movss" } } */ > +/* { dg-final { scan-assembler-times "shufps" 1 } } */ > +/* { dg-final { scan-assembler-not "movaps" } } */ > +/* { dg-final { scan-assembler-not "movlhps" } } */ > +/* { dg-final { scan-assembler-not "unpcklps" } } */ > + > +typedef float __v4sf __attribute__ ((__vector_size__ (16))); > + > +__v4sf > +foo (__v4sf x, float f) > +{ > + __v4sf y = { f, x[0], x[2], x[3] }; > + return y; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3b.c > b/gcc/testsuite/gcc.target/i386/pr88828-3b.c > new file mode 100644 > index 00000000000..ab2ba730716 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr88828-3b.c > @@ -0,0 +1,18 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx" } */ > +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ > +/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! 
ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */ > +/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */ > +/* { dg-final { scan-assembler-not "vmovaps" } } */ > +/* { dg-final { scan-assembler-not "vmovlhps" } } */ > +/* { dg-final { scan-assembler-not "vunpcklps" } } */ > + > +typedef float __v4sf __attribute__ ((__vector_size__ (16))); > + > +__v4sf > +foo (__v4sf x, float f) > +{ > + __v4sf y = { f, x[0], x[2], x[3] }; > + return y; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4a.c > b/gcc/testsuite/gcc.target/i386/pr88828-4a.c > new file mode 100644 > index 00000000000..a54689be701 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr88828-4a.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse -mno-sse4" } */ > +/* { dg-final { scan-assembler "movss" } } */ > +/* { dg-final { scan-assembler-times "shufps" 1 } } */ > +/* { dg-final { scan-assembler-not "movaps" } } */ > +/* { dg-final { scan-assembler-not "movlhps" } } */ > +/* { dg-final { scan-assembler-not "unpcklps" } } */ > + > +typedef float __v4sf __attribute__ ((__vector_size__ (16))); > + > +__v4sf > +foo (__v4sf x, float f) > +{ > + __v4sf y = { x[0], x[2], x[3], x[1] }; > + y[0] = f; > + return y; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4b.c > b/gcc/testsuite/gcc.target/i386/pr88828-4b.c > new file mode 100644 > index 00000000000..0c3a1024d93 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr88828-4b.c > @@ -0,0 +1,20 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx" } */ > +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ > +/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! 
ia32 } } } } */ > +/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */ > +/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */ > +/* { dg-final { scan-assembler-not "vshufps" } } */ > +/* { dg-final { scan-assembler-not "vmovaps" } } */ > +/* { dg-final { scan-assembler-not "vmovlhps" } } */ > +/* { dg-final { scan-assembler-not "vunpcklps" } } */ > + > +typedef float __v4sf __attribute__ ((__vector_size__ (16))); > + > +__v4sf > +foo (__v4sf x, float f) > +{ > + __v4sf y = { x[0], x[2], x[3], x[1] }; > + y[0] = f; > + return y; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5a.c > b/gcc/testsuite/gcc.target/i386/pr88828-5a.c > new file mode 100644 > index 00000000000..534808d3cd1 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr88828-5a.c > @@ -0,0 +1,16 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse -mno-sse4" } */ > +/* { dg-final { scan-assembler "movss" } } */ > +/* { dg-final { scan-assembler-times "shufps" 2 } } */ > +/* { dg-final { scan-assembler-times "movaps" 1 } } */ > +/* { dg-final { scan-assembler-not "movlhps" } } */ > +/* { dg-final { scan-assembler-not "unpcklps" } } */ > + > +typedef float __v4sf __attribute__ ((__vector_size__ (16))); > + > +__v4sf > +foo (__v4sf x, float f) > +{ > + __v4sf y = { x[0], x[2], x[3], f }; > + return y; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5b.c > b/gcc/testsuite/gcc.target/i386/pr88828-5b.c > new file mode 100644 > index 00000000000..aebea790979 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr88828-5b.c > @@ -0,0 +1,18 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx" } */ > +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ > +/* { dg-final { scan-assembler-times "vinsertps" 1 } } */ > +/* { dg-final { scan-assembler-not "vmovss" } } */ > +/* { dg-final { scan-assembler-not "vshufps" } } */ > +/* { dg-final { scan-assembler-not "vmovaps" } } */ > +/* { dg-final { scan-assembler-not "vmovlhps" } } */ > 
+/* { dg-final { scan-assembler-not "vunpcklps" } } */ > + > +typedef float __v4sf __attribute__ ((__vector_size__ (16))); > + > +__v4sf > +foo (__v4sf x, float f) > +{ > + __v4sf y = { x[0], x[2], x[3], f }; > + return y; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6a.c > b/gcc/testsuite/gcc.target/i386/pr88828-6a.c > new file mode 100644 > index 00000000000..d43a36d9137 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr88828-6a.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse -mno-sse4" } */ > +/* { dg-final { scan-assembler "movss" } } */ > +/* { dg-final { scan-assembler-times "shufps" 2 } } */ > +/* { dg-final { scan-assembler-times "movaps" 1 } } */ > +/* { dg-final { scan-assembler-not "movlhps" } } */ > +/* { dg-final { scan-assembler-not "unpcklps" } } */ > + > +typedef float __v4sf __attribute__ ((__vector_size__ (16))); > + > +__v4sf > +foo (__v4sf x, float f) > +{ > + __v4sf y = { x[0], x[2], x[3], x[0] }; > + y[3] = f; > + return y; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6b.c > b/gcc/testsuite/gcc.target/i386/pr88828-6b.c > new file mode 100644 > index 00000000000..6856fe6500e > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr88828-6b.c > @@ -0,0 +1,19 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx" } */ > +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ > +/* { dg-final { scan-assembler-times "vinsertps" 1 } } */ > +/* { dg-final { scan-assembler-not "vshufps" } } */ > +/* { dg-final { scan-assembler-not "vmovss" } } */ > +/* { dg-final { scan-assembler-not "vmovaps" } } */ > +/* { dg-final { scan-assembler-not "vmovlhps" } } */ > +/* { dg-final { scan-assembler-not "vunpcklps" } } */ > + > +typedef float __v4sf __attribute__ ((__vector_size__ (16))); > + > +__v4sf > +foo (__v4sf x, float f) > +{ > + __v4sf y = { x[0], x[2], x[3], x[0] }; > + y[3] = f; > + return y; > +} > -- > 2.20.1 >