)
On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <[email protected]> wrote:
>
> For vector init constructor:
>
> ---
> typedef float __v4sf __attribute__ ((__vector_size__ (16)));
>
> __v4sf
> foo (__v4sf x, float f)
> {
> __v4sf y = { f, x[1], x[2], x[3] };
> return y;
> }
> ---
>
> we can optimize vector init constructor with vector copy or permute
> followed by a single scalar insert:
>
> __v4sf D.1912;
> __v4sf D.1913;
> __v4sf D.1914;
> __v4sf y;
>
> x.0_1 = x;
> D.1912 = x.0_1;
> _2 = D.1912;
> D.1913 = _2;
> BIT_FIELD_REF <D.1913, 32, 0> = f;
> y = D.1913;
> D.1914 = y;
> return D.1914;
>
> instead of
>
> __v4sf D.1962;
> __v4sf y;
>
> _1 = BIT_FIELD_REF <x, 32, 32>;
> _2 = BIT_FIELD_REF <x, 32, 64>;
> _3 = BIT_FIELD_REF <x, 32, 96>;
> y = {f, _1, _2, _3};
> D.1962 = y;
> return D.1962;
>
> gcc/
>
> PR tree-optimization/88828
> * gimplify.c (gimplify_init_constructor): Optimize vector init
> constructor with vector copy or permute followed by a single
> scalar insert.
Doing this here does not catch things like:
typedef float __v4sf __attribute__ ((__vector_size__ (16)));
__v4sf
vector_init (float f0,float f1, float f2,float f3)
{
__v4sf y = { f0, f1, f2, f3 };
return y;
}
__v4sf
foo (__v4sf x, float f)
{
return vector_init (f, x[1], x[2], x[3]) ;
}
>
> gcc/testsuite/
>
> PR tree-optimization/88828
> * gcc.target/i386/pr88828-1.c: New test.
> * gcc.target/i386/pr88828-2.c: Likewise.
> * gcc.target/i386/pr88828-3a.c: Likewise.
> * gcc.target/i386/pr88828-3b.c: Likewise.
> * gcc.target/i386/pr88828-4a.c: Likewise.
> * gcc.target/i386/pr88828-4b.c: Likewise.
> * gcc.target/i386/pr88828-5a.c: Likewise.
> * gcc.target/i386/pr88828-5b.c: Likewise.
> * gcc.target/i386/pr88828-6a.c: Likewise.
> * gcc.target/i386/pr88828-6b.c: Likewise.
> ---
> gcc/gimplify.c | 176 +++++++++++++++++++--
> gcc/testsuite/gcc.target/i386/pr88828-1.c | 16 ++
> gcc/testsuite/gcc.target/i386/pr88828-2.c | 17 ++
> gcc/testsuite/gcc.target/i386/pr88828-3a.c | 16 ++
> gcc/testsuite/gcc.target/i386/pr88828-3b.c | 18 +++
> gcc/testsuite/gcc.target/i386/pr88828-4a.c | 17 ++
> gcc/testsuite/gcc.target/i386/pr88828-4b.c | 20 +++
> gcc/testsuite/gcc.target/i386/pr88828-5a.c | 16 ++
> gcc/testsuite/gcc.target/i386/pr88828-5b.c | 18 +++
> gcc/testsuite/gcc.target/i386/pr88828-6a.c | 17 ++
> gcc/testsuite/gcc.target/i386/pr88828-6b.c | 19 +++
> 11 files changed, 336 insertions(+), 14 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3a.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3b.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4a.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4b.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5a.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5b.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6a.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6b.c
>
> diff --git a/gcc/gimplify.c b/gcc/gimplify.c
> index 983635ba21f..893a4311f9e 100644
> --- a/gcc/gimplify.c
> +++ b/gcc/gimplify.c
> @@ -5082,22 +5082,170 @@ gimplify_init_constructor (tree *expr_p, gimple_seq
> *pre_p, gimple_seq *post_p,
> TREE_CONSTANT (ctor) = 0;
> }
>
> - /* Vector types use CONSTRUCTOR all the way through gimple
> - compilation as a general initializer. */
> - FOR_EACH_VEC_SAFE_ELT (elts, ix, ce)
> + tree rhs_vector = NULL;
> + /* The vector element to replace scalar elements, which
> + will be overridden by scalar insert. */
> + tree vector_element = NULL;
> + /* The single scalar element. */
> + tree scalar_element = NULL;
> + unsigned int scalar_idx = 0;
> + enum { unknown, copy, permute, init } operation = unknown;
> + bool insert = false;
> +
> + /* Check if we can generate vector copy or permute followed by
> + a single scalar insert. */
> + if (TYPE_VECTOR_SUBPARTS (type).is_constant ())
> {
> - enum gimplify_status tret;
> - tret = gimplify_expr (&ce->value, pre_p, post_p, is_gimple_val,
> - fb_rvalue);
> - if (tret == GS_ERROR)
> - ret = GS_ERROR;
> - else if (TREE_STATIC (ctor)
> - && !initializer_constant_valid_p (ce->value,
> - TREE_TYPE (ce->value)))
> - TREE_STATIC (ctor) = 0;
> + /* If all RHS vector elements come from the same vector,
> + we can use permute. If all RHS vector elements come
> + from the same vector in the same order, we can use
> + copy. */
> + unsigned int nunits
> + = TYPE_VECTOR_SUBPARTS (type).to_constant ();
> + unsigned int nscalars = 0;
> + unsigned int nvectors = 0;
> + operation = unknown;
> + FOR_EACH_VEC_SAFE_ELT (elts, ix, ce)
> + if (TREE_CODE (ce->value) == ARRAY_REF
> + || TREE_CODE (ce->value) == ARRAY_RANGE_REF)
> + {
> + if (!vector_element)
> + vector_element = ce->value;
> + /* Get the vector index. */
> + tree idx = TREE_OPERAND (ce->value, 1);
> + if (TREE_CODE (idx) == INTEGER_CST)
> + {
> + /* Get the RHS vector. */
> + tree r = ce->value;
> + while (handled_component_p (r))
> + r = TREE_OPERAND (r, 0);
> + if (type == TREE_TYPE (r))
> + {
> + /* The RHS vector has the same type as
> + LHS. */
> + if (rhs_vector == NULL)
> + rhs_vector = r;
> +
> + /* Check if all RHS vector elements come
> + from the same vector. */
> + if (rhs_vector == r)
> + {
> + nvectors++;
> + if (TREE_INT_CST_LOW (idx) == ix
> + && (operation == unknown
> + || operation == copy))
> + operation = copy;
> + else
> + operation = permute;
> + continue;
> + }
> + }
> + }
> +
> + /* Otherwise, use vector init. */
> + break;
> + }
> + else if (TREE_CODE (TYPE_SIZE (TREE_TYPE (ce->value)))
> + == INTEGER_CST)
> + {
> + /* Only allow one single scalar insert. */
> + if (nscalars != 0)
> + break;
> + nscalars = 1;
> + insert = true;
> + scalar_idx = ix;
> + scalar_element = ce->value;
> + }
> +
> + /* Allow a single scalar insert with vector copy or
> + vector permute. Vector copy without insert is OK. */
> + if (nunits != (nscalars + nvectors)
> + || (nscalars == 0 && operation != copy))
> + operation = unknown;
> + }
> +
> + if (operation == unknown)
> + {
> + /* Default to the regular vector init constructor. */
> + operation = init;
> + insert = false;
> + }
> +
> + if (operation == copy)
> + {
> + /* Generate a vector copy. */
> + tree var = create_tmp_var (type);
> + if (gimplify_expr (&rhs_vector, pre_p, post_p,
> + is_gimple_val, fb_rvalue) == GS_ERROR)
> + {
> + ret = GS_ERROR;
> + break;
> + }
> + gassign *init = gimple_build_assign (var, rhs_vector);
> + gimple_seq_add_stmt (pre_p, init);
> + if (gimplify_expr (&var, pre_p, post_p, is_gimple_val,
> + fb_rvalue) == GS_ERROR)
> + {
> + ret = GS_ERROR;
> + break;
> + }
> + /* Replace RHS with the vector copy. */
> + if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0)))
> + TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (var, pre_p);
> + else
> + TREE_OPERAND (*expr_p, 1) = var;
> + }
> + else
> + {
> + /* Prepare for vector permute by replacing the scalar
> + element with the vector one. */
> + if (operation == permute)
> + (elts->address())[scalar_idx].value = vector_element;
> +
> + /* Vector types use CONSTRUCTOR all the way through gimple
> + compilation as a general initializer. */
> + FOR_EACH_VEC_SAFE_ELT (elts, ix, ce)
> + {
> + enum gimplify_status tret;
> + tret = gimplify_expr (&ce->value, pre_p, post_p,
> + is_gimple_val,
> + fb_rvalue);
> + if (tret == GS_ERROR)
> + ret = GS_ERROR;
> + else if (TREE_STATIC (ctor)
> + && !initializer_constant_valid_p (ce->value,
> + TREE_TYPE (ce->value)))
> + TREE_STATIC (ctor) = 0;
> + }
> + if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0)))
> + TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (ctor, pre_p);
> + }
> +
> + if (insert)
> + {
> + /* Generate a single scalar insert after vector copy or
> + permute. */
> + tree rhs = TREE_OPERAND (*expr_p, 1);
> + tree var = create_tmp_var (type);
> + gassign *init = gimple_build_assign (var, rhs);
> + gimple_seq_add_stmt (pre_p, init);
> + if (gimplify_expr (&scalar_element, pre_p, post_p,
> + is_gimple_val, fb_rvalue) == GS_ERROR)
> + {
> + ret = GS_ERROR;
> + break;
> + }
> + tree scalar_type = TREE_TYPE (scalar_element);
> + tree scalar_size = TYPE_SIZE (scalar_type);
> + tree bitpos = bitsize_int (scalar_idx
> + * TREE_INT_CST_LOW (scalar_size));
> + tree ref = build3_loc (EXPR_LOCATION (rhs), BIT_FIELD_REF,
> + scalar_type, var, scalar_size,
> + bitpos);
> + init = gimple_build_assign (ref, scalar_element);
> + gimplify_seq_add_stmt (pre_p, init);
> + TREE_OPERAND (*expr_p, 1) = var;
> }
> - if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0)))
> - TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (ctor, pre_p);
> }
> break;
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1.c
> b/gcc/testsuite/gcc.target/i386/pr88828-1.c
> new file mode 100644
> index 00000000000..4ef1feab389
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-1.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse -mno-sse4" } */
> +/* { dg-final { scan-assembler "movss" } } */
> +/* { dg-final { scan-assembler-not "movaps" } } */
> +/* { dg-final { scan-assembler-not "movlhps" } } */
> +/* { dg-final { scan-assembler-not "unpcklps" } } */
> +/* { dg-final { scan-assembler-not "shufps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> + __v4sf y = { f, x[1], x[2], x[3] };
> + return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2.c
> b/gcc/testsuite/gcc.target/i386/pr88828-2.c
> new file mode 100644
> index 00000000000..6dc482b6f4b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-2.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse -mno-sse4" } */
> +/* { dg-final { scan-assembler "movss" } } */
> +/* { dg-final { scan-assembler-not "movaps" } } */
> +/* { dg-final { scan-assembler-not "movlhps" } } */
> +/* { dg-final { scan-assembler-not "unpcklps" } } */
> +/* { dg-final { scan-assembler-not "shufps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> + __v4sf y = x;
> + y[0] = f;
> + return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3a.c
> b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
> new file mode 100644
> index 00000000000..97eb8e7162a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse -mno-sse4" } */
> +/* { dg-final { scan-assembler "movss" } } */
> +/* { dg-final { scan-assembler-times "shufps" 1 } } */
> +/* { dg-final { scan-assembler-not "movaps" } } */
> +/* { dg-final { scan-assembler-not "movlhps" } } */
> +/* { dg-final { scan-assembler-not "unpcklps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> + __v4sf y = { f, x[0], x[2], x[3] };
> + return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3b.c
> b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
> new file mode 100644
> index 00000000000..ab2ba730716
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
> +/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
> +/* { dg-final { scan-assembler-not "vmovaps" } } */
> +/* { dg-final { scan-assembler-not "vmovlhps" } } */
> +/* { dg-final { scan-assembler-not "vunpcklps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> + __v4sf y = { f, x[0], x[2], x[3] };
> + return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4a.c
> b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
> new file mode 100644
> index 00000000000..a54689be701
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse -mno-sse4" } */
> +/* { dg-final { scan-assembler "movss" } } */
> +/* { dg-final { scan-assembler-times "shufps" 1 } } */
> +/* { dg-final { scan-assembler-not "movaps" } } */
> +/* { dg-final { scan-assembler-not "movlhps" } } */
> +/* { dg-final { scan-assembler-not "unpcklps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> + __v4sf y = { x[0], x[2], x[3], x[1] };
> + y[0] = f;
> + return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4b.c
> b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
> new file mode 100644
> index 00000000000..0c3a1024d93
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
> +/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
> +/* { dg-final { scan-assembler-not "vshufps" } } */
> +/* { dg-final { scan-assembler-not "vmovaps" } } */
> +/* { dg-final { scan-assembler-not "vmovlhps" } } */
> +/* { dg-final { scan-assembler-not "vunpcklps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> + __v4sf y = { x[0], x[2], x[3], x[1] };
> + y[0] = f;
> + return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5a.c
> b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
> new file mode 100644
> index 00000000000..534808d3cd1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse -mno-sse4" } */
> +/* { dg-final { scan-assembler "movss" } } */
> +/* { dg-final { scan-assembler-times "shufps" 2 } } */
> +/* { dg-final { scan-assembler-times "movaps" 1 } } */
> +/* { dg-final { scan-assembler-not "movlhps" } } */
> +/* { dg-final { scan-assembler-not "unpcklps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> + __v4sf y = { x[0], x[2], x[3], f };
> + return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5b.c
> b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
> new file mode 100644
> index 00000000000..aebea790979
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
> +/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
> +/* { dg-final { scan-assembler-not "vmovss" } } */
> +/* { dg-final { scan-assembler-not "vshufps" } } */
> +/* { dg-final { scan-assembler-not "vmovaps" } } */
> +/* { dg-final { scan-assembler-not "vmovlhps" } } */
> +/* { dg-final { scan-assembler-not "vunpcklps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> + __v4sf y = { x[0], x[2], x[3], f };
> + return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6a.c
> b/gcc/testsuite/gcc.target/i386/pr88828-6a.c
> new file mode 100644
> index 00000000000..d43a36d9137
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-6a.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse -mno-sse4" } */
> +/* { dg-final { scan-assembler "movss" } } */
> +/* { dg-final { scan-assembler-times "shufps" 2 } } */
> +/* { dg-final { scan-assembler-times "movaps" 1 } } */
> +/* { dg-final { scan-assembler-not "movlhps" } } */
> +/* { dg-final { scan-assembler-not "unpcklps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> + __v4sf y = { x[0], x[2], x[3], x[0] };
> + y[3] = f;
> + return y;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6b.c
> b/gcc/testsuite/gcc.target/i386/pr88828-6b.c
> new file mode 100644
> index 00000000000..6856fe6500e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88828-6b.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
> +/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
> +/* { dg-final { scan-assembler-not "vshufps" } } */
> +/* { dg-final { scan-assembler-not "vmovss" } } */
> +/* { dg-final { scan-assembler-not "vmovaps" } } */
> +/* { dg-final { scan-assembler-not "vmovlhps" } } */
> +/* { dg-final { scan-assembler-not "vunpcklps" } } */
> +
> +typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> +
> +__v4sf
> +foo (__v4sf x, float f)
> +{
> + __v4sf y = { x[0], x[2], x[3], x[0] };
> + y[3] = f;
> + return y;
> +}
> --
> 2.20.1
>