On Wed, Mar 6, 2019 at 8:33 PM Richard Biener <richard.guent...@gmail.com> wrote: > > On Wed, Mar 6, 2019 at 8:46 AM H.J. Lu <hjl.to...@gmail.com> wrote: > > > > On Tue, Mar 5, 2019 at 1:46 AM H.J. Lu <hjl.to...@gmail.com> wrote: > > > > > > On Mon, Mar 04, 2019 at 12:55:04PM +0100, Richard Biener wrote: > > > > On Sun, Mar 3, 2019 at 10:13 PM H.J. Lu <hjl.to...@gmail.com> wrote: > > > > > > > > > > On Sun, Mar 03, 2019 at 06:40:09AM -0800, Andrew Pinski wrote: > > > > > > ) > > > > > > ,On Sun, Mar 3, 2019 at 6:32 AM H.J. Lu <hjl.to...@gmail.com> wrote: > > > > > > > > > > > > > > For vector init constructor: > > > > > > > > > > > > > > --- > > > > > > > typedef float __v4sf __attribute__ ((__vector_size__ (16))); > > > > > > > > > > > > > > __v4sf > > > > > > > foo (__v4sf x, float f) > > > > > > > { > > > > > > > __v4sf y = { f, x[1], x[2], x[3] }; > > > > > > > return y; > > > > > > > } > > > > > > > --- > > > > > > > > > > > > > > we can optimize vector init constructor with vector copy or > > > > > > > permute > > > > > > > followed by a single scalar insert: > > > > > > > and you want to advance to the _1 = BIT_INSERT_EXPR here. The easiest > > > > way > > > > is to emit a new stmt for _2 = copy ...; and do the set_rhs with the > > > > BIT_INSERT_EXPR. > > > > > > Thanks for BIT_INSERT_EXPR suggestion. I am testing this patch. > > > > > > > > > H.J. > > > --- > > > We can optimize vector constructor with vector copy or permute followed > > > by a single scalar insert: > > > > > > __v4sf y; > > > __v4sf D.1930; > > > float _1; > > > float _2; > > > float _3; > > > > > > <bb 2> : > > > _1 = BIT_FIELD_REF <x_9(D), 32, 96>; > > > _2 = BIT_FIELD_REF <x_9(D), 32, 64>; > > > _3 = BIT_FIELD_REF <x_9(D), 32, 32>; > > > y_6 = {f_5(D), _3, _2, _1}; > > > return y_6; > > > > > > with > > > > > > __v4sf y; > > > __v4sf D.1930; > > > float _1; > > > float _2; > > > float _3; > > > vector(4) float _8; > > > > > > <bb 2> : > > > _1 = BIT_FIELD_REF <x_9(D), 32, 96>; > > > _2 = BIT_FIELD_REF <x_9(D), 32, 64>; > > > _3 = BIT_FIELD_REF <x_9(D), 32, 32>; > > > _8 = x_9(D); > > > y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>; > > > return y_6; > > > > > > gcc/ > > > > > > PR tree-optimization/88828 > > > * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize > > > vector init constructor with vector copy or permute followed > > > by a single scalar insert. > > > > > > gcc/testsuite/ > > > > > > PR tree-optimization/88828 > > > * gcc.target/i386/pr88828-1a.c: New test. > > > * gcc.target/i386/pr88828-2b.c: Likewise. > > > * gcc.target/i386/pr88828-2.c: Likewise. > > > * gcc.target/i386/pr88828-3a.c: Likewise. > > > * gcc.target/i386/pr88828-3b.c: Likewise. > > > * gcc.target/i386/pr88828-3c.c: Likewise. > > > * gcc.target/i386/pr88828-3d.c: Likewise. > > > * gcc.target/i386/pr88828-4a.c: Likewise. > > > * gcc.target/i386/pr88828-4b.c: Likewise. > > > * gcc.target/i386/pr88828-5a.c: Likewise. > > > * gcc.target/i386/pr88828-5b.c: Likewise. > > > * gcc.target/i386/pr88828-6a.c: Likewise. > > > * gcc.target/i386/pr88828-6b.c: Likewise. > > > > Here is the updated patch with run-time tests. > > - if (TREE_CODE (elt->value) != SSA_NAME) > + if (TREE_CODE (ce->value) != SSA_NAME) > return false; > > hmm, so it doesn't allow { 0, v[1], v[2], v[3] }? I think the single > scalar value can be a constant as well.
Fixed. > if (!def_stmt) > - return false; > + { > + if (gimple_nop_p (SSA_NAME_DEF_STMT (ce->value))) > > if (SSA_NAME_IS_DEFAULT_DEF (ce->value)) > > + { > > also you seem to disallow > > { i + 1, v[1], v[2], v[3] } Fixed by if (code != BIT_FIELD_REF) { /* Only allow one scalar insert. */ if (nscalars != 0) return false; nscalars = 1; insert = true; scalar_idx = i; sel.quick_push (i); scalar_element = ce->value; continue; } > because get_prop_source_stmt will return the definition computing > i + 1 in this case and your code will be skipped? > > I think you can simplify the code by treating scalar_element != NULL > as nscalars == 1 and eliding nscalars. It works only if TYPE_VECTOR_SUBPARTS (type).to_constant () == (nscalars + nvectors) We need to check both nscalars and nvectors. Elide nscalar check doesn't help much here. > - if (conv_code == ERROR_MARK) > + if (conv_code == ERROR_MARK && !insert) > gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0], > orig[1], op2); > else > @@ -2148,10 +2198,25 @@ simplify_vector_constructor (gimple_stmt_iterator > *gsi) > VEC_PERM_EXPR, orig[0], orig[1], op2); > orig[0] = gimple_assign_lhs (perm); > gsi_insert_before (gsi, perm, GSI_SAME_STMT); > - gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0], > + gimple_assign_set_rhs_with_ops (gsi, > + (conv_code != ERROR_MARK > + ? conv_code > + : NOP_EXPR), > + orig[0], > NULL_TREE, NULL_TREE); > > I believe you should elide the last stmt for conv_code == ERROR_MARK, > that is, why did you need to add the && !insert check in the guarding > condition When conv_code == ERROR_MARK, we still need gimple *perm = gimple_build_assign (make_ssa_name (TREE_TYPE (orig[0])), VEC_PERM_EXPR, orig[0], orig[1], op2); orig[0] = gimple_assign_lhs (perm); gsi_insert_before (gsi, perm, GSI_SAME_STMT); gimple_assign_set_rhs_with_ops (gsi, NOP_EXPR, orig[0], NULL_TREE, NULL_TREE); Otherwise, scalar insert won't work. > (this path should already do the correct thing?). Note that in all > cases it looks > that with conv_code != ERROR_MARK you may end up doing a float->int > or int->float conversion on a value it wasn't done on before which might > raise exceptions? That is, do we need to make sure we permute a > value we already do convert into the place we're going to insert to? This couldn't happen: if (type == TREE_TYPE (ref)) { /* The RHS vector has the same type as LHS. */ if (rhs_vector == NULL) rhs_vector = ref; /* Check if all RHS vector elements come fome the same vector. */ if (rhs_vector == ref) nvectors++; } ... if (insert && (nvectors == 0 || (TYPE_VECTOR_SUBPARTS (type).to_constant () != (nscalars + nvectors)))) return false; > + if (insert) > + { > + /* Generate a single scalar insert. */ > + tree var = make_ssa_name (type); > + tree val = gimple_assign_rhs1 (stmt); > + gimple *copy = gimple_build_assign (var, val); > > I believe this doesn't properly copy the stmt in case it is a permute. > You can use (note the use of gsi_stmt - gimple_assign_set_rhs_with_ops > can re-allocate the stmt) > > gimple *copy = gimple_copy (gsi_stmt (*gsi)); > gimple_assign_set_lhs (copy, var); Fixed. > + gsi_insert_before (gsi, copy, GSI_SAME_STMT); > + tree bitpos = bitsize_int (scalar_idx * elem_size); > + gimple_assign_set_rhs_with_ops (gsi, BIT_INSERT_EXPR, var, > + scalar_element, bitpos); > + } > > Otherwise looks OK to me. > > As separate followup patch it might be interesting to support > > { 0, a[1], a[2], 3 } > > kinds as well, thus combining a VECTOR_CST (which is > reasonably cheap to create) with another vector. That should > be maybe done as a first patch given this is just a two-vector > permute which the code already handles apart from not > recognizing the implicit constant vector participating. > > Similar > > { 0, a[1], b[2], 3 } > > where the combination of a and b is blended with another > constant vector. I'm not sure if handling an arbitrary number > of scalar elements should be done in a similar way, that is, > implementing > > { s1, a[1], a[2], s2, s3, b[0], b[1], b[2] } > > as > > tem = VEC_PERM <a, b, { ... }> > tem2 = { s1, 0, 0, s2, s3, 0, 0, 0 } > res = VEC_PERM <tem, tem2, { blend-mask }> > > where constructing tem2 should take at most > N-1 inserts (the first element to insert into tem2 > can use a splat or if element zero a zero-extending move). > > Doing this effectively lifts the restriction of only > handling two vectors - we'd incrementally do > two-vector permute plus blend of the rest which has > its constructor re-processed. > > But as said - the code is already a bit awkward so changing > this in multiple reivisions is preferred and the single-element > case is certainly sth to do via a BIT_INSERT_EXPR. Agreed. I am testing this updated patch. -- H.J.
From 91e060c13b3d729772c530640ee1e692d80667e9 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" <hjl.to...@gmail.com> Date: Tue, 5 Feb 2019 15:39:27 -0800 Subject: [PATCH] Optimize vector constructor We can optimize vector constructor with vector copy or permute followed by a single scalar insert: __v4sf y; __v4sf D.1930; float _1; float _2; float _3; <bb 2> : _1 = BIT_FIELD_REF <x_9(D), 32, 96>; _2 = BIT_FIELD_REF <x_9(D), 32, 64>; _3 = BIT_FIELD_REF <x_9(D), 32, 32>; y_6 = {f_5(D), _3, _2, _1}; return y_6; with __v4sf y; __v4sf D.1930; float _1; float _2; float _3; vector(4) float _8; <bb 2> : _1 = BIT_FIELD_REF <x_9(D), 32, 96>; _2 = BIT_FIELD_REF <x_9(D), 32, 64>; _3 = BIT_FIELD_REF <x_9(D), 32, 32>; _8 = x_9(D); y_6 = BIT_INSERT_EXPR <x_9(D), f_5(D), 0 (32 bits)>; return y_6; gcc/ PR tree-optimization/88828 * tree-ssa-forwprop.c (simplify_vector_constructor): Optimize vector init constructor with vector copy or permute followed by a single scalar insert. gcc/testsuite/ PR tree-optimization/88828 * gcc.target/i386/pr88828-1.c: New test. * gcc.target/i386/pr88828-1a.c: Likewise. * gcc.target/i386/pr88828-1b.c: Likewise. * gcc.target/i386/pr88828-1c.c: Likewise. * gcc.target/i386/pr88828-2.c: Likewise. * gcc.target/i386/pr88828-2a.c: Likewise. * gcc.target/i386/pr88828-2b.c: Likewise. * gcc.target/i386/pr88828-2c.c: Likewise. * gcc.target/i386/pr88828-2d.c: Likewise. * gcc.target/i386/pr88828-3.c: Likewise. * gcc.target/i386/pr88828-3a.c: Likewise. * gcc.target/i386/pr88828-3b.c: Likewise. * gcc.target/i386/pr88828-3c.c: Likewise. * gcc.target/i386/pr88828-3d.c: Likewise. * gcc.target/i386/pr88828-4a.c: Likewise. * gcc.target/i386/pr88828-4b.c: Likewise. * gcc.target/i386/pr88828-5a.c: Likewise. * gcc.target/i386/pr88828-5b.c: Likewise. * gcc.target/i386/pr88828-6.c: Likewise. * gcc.target/i386/pr88828-6a.c: Likewise. * gcc.target/i386/pr88828-6b.c: Likewise. * gcc.target/i386/pr88828-7.c: Likewise. * gcc.target/i386/pr88828-7a.c: Likewise. * gcc.target/i386/pr88828-7b.c: Likewise. * gcc.target/i386/pr88828-8.c: Likewise. * gcc.target/i386/pr88828-8a.c: Likewise. * gcc.target/i386/pr88828-8b.c: Likewise. --- gcc/testsuite/gcc.target/i386/pr88828-1.c | 49 +++++++++++ gcc/testsuite/gcc.target/i386/pr88828-1a.c | 17 ++++ gcc/testsuite/gcc.target/i386/pr88828-1b.c | 23 ++++++ gcc/testsuite/gcc.target/i386/pr88828-1c.c | 18 ++++ gcc/testsuite/gcc.target/i386/pr88828-2.c | 51 ++++++++++++ gcc/testsuite/gcc.target/i386/pr88828-2a.c | 17 ++++ gcc/testsuite/gcc.target/i386/pr88828-2b.c | 19 +++++ gcc/testsuite/gcc.target/i386/pr88828-2c.c | 23 ++++++ gcc/testsuite/gcc.target/i386/pr88828-2d.c | 25 ++++++ gcc/testsuite/gcc.target/i386/pr88828-3.c | 54 ++++++++++++ gcc/testsuite/gcc.target/i386/pr88828-3a.c | 17 ++++ gcc/testsuite/gcc.target/i386/pr88828-3b.c | 19 +++++ gcc/testsuite/gcc.target/i386/pr88828-3c.c | 25 ++++++ gcc/testsuite/gcc.target/i386/pr88828-4a.c | 18 ++++ gcc/testsuite/gcc.target/i386/pr88828-4b.c | 21 +++++ gcc/testsuite/gcc.target/i386/pr88828-5a.c | 18 ++++ gcc/testsuite/gcc.target/i386/pr88828-5b.c | 20 +++++ gcc/testsuite/gcc.target/i386/pr88828-6.c | 47 +++++++++++ gcc/testsuite/gcc.target/i386/pr88828-6a.c | 16 ++++ gcc/testsuite/gcc.target/i386/pr88828-6b.c | 22 +++++ gcc/testsuite/gcc.target/i386/pr88828-7.c | 53 ++++++++++++ gcc/testsuite/gcc.target/i386/pr88828-7a.c | 16 ++++ gcc/testsuite/gcc.target/i386/pr88828-7b.c | 22 +++++ gcc/testsuite/gcc.target/i386/pr88828-8.c | 46 +++++++++++ gcc/testsuite/gcc.target/i386/pr88828-8a.c | 15 ++++ gcc/testsuite/gcc.target/i386/pr88828-8b.c | 21 +++++ gcc/tree-ssa-forwprop.c | 96 +++++++++++++++++++--- 27 files changed, 776 insertions(+), 12 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1b.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1c.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2b.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2c.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2d.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3b.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3c.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4b.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5b.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6b.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-7.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-7a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-7b.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-8.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-8a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-8b.c diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1.c b/gcc/testsuite/gcc.target/i386/pr88828-1.c new file mode 100644 index 00000000000..a15d1fea3f5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-1.c @@ -0,0 +1,49 @@ +/* { dg-do run { target sse2_runtime } } */ +/* { dg-options "-O2 -msse2" } */ + +#include "pr88828-1a.c" +#include "pr88828-1b.c" +#include "pr88828-1c.c" + +extern void abort (); + +void +do_check (__v4sf y, float f[4], float z) +{ + int i; + + for (i = 0; i < 4; i++) + if (i == 0) + { + if (y[i] != z) + abort (); + } + else + { + if (y[i] != f[i]) + abort (); + } +} + +int +main (void) +{ + float f[4] = { -11, 2, 55553, -4 }; + float z = 134567; + __v4sf x = { f[0], f[1], f[2], f[3] }; + __v4sf y; + int i; + + for (i = 0; i < 4; i++) + if (x[i] != f[i]) + abort (); + + y = foo1 (x, z); + do_check (y, f, z); + y = foo2 (x, z); + do_check (y, f, z); + y = foo3 (x, z); + do_check (y, f, z); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1a.c b/gcc/testsuite/gcc.target/i386/pr88828-1a.c new file mode 100644 index 00000000000..d37b24c6661 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-1a.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo1 (__v4sf x, float f) +{ + __v4sf y = { f, x[1], x[2], x[3] }; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1b.c b/gcc/testsuite/gcc.target/i386/pr88828-1b.c new file mode 100644 index 00000000000..af4aced65f8 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-1b.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +static __v4sf +vector_init (float f0,float f1, float f2,float f3) +{ + __v4sf y = { f0, f1, f2, f3 }; + return y; +} + +__attribute__((noinline, noclone)) +__v4sf +foo2 (__v4sf x, float f) +{ + return vector_init (f, x[1], x[2], x[3]) ; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1c.c b/gcc/testsuite/gcc.target/i386/pr88828-1c.c new file mode 100644 index 00000000000..a117f3ec7b1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-1c.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo3 (__v4sf x, float f) +{ + __v4sf y = x; + y[0] = f; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2.c b/gcc/testsuite/gcc.target/i386/pr88828-2.c new file mode 100644 index 00000000000..011fd486bb1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-2.c @@ -0,0 +1,51 @@ +/* { dg-do run { target sse2_runtime } } */ +/* { dg-options "-O2 -msse2" } */ + +#include "pr88828-2a.c" +#include "pr88828-2c.c" + +extern void abort (); + +void +do_check (__v4sf y, float f[4], float z) +{ + int i; + + for (i = 0; i < 4; i++) + if (i == 0) + { + if (y[i] != z) + abort (); + } + else if (i == 1) + { + if (y[i] != f[0]) + abort (); + } + else + { + if (y[i] != f[i]) + abort (); + } +} + +int +main (void) +{ + float f[4] = { -11, 2, 55553, -4 }; + float z = 134567; + __v4sf x = { f[0], f[1], f[2], f[3] }; + __v4sf y; + int i; + + for (i = 0; i < 4; i++) + if (x[i] != f[i]) + abort (); + + y = foo1 (x, z); + do_check (y, f, z); + y = foo2 (x, z); + do_check (y, f, z); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2a.c b/gcc/testsuite/gcc.target/i386/pr88828-2a.c new file mode 100644 index 00000000000..85e49535ebd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-2a.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-times "shufps" 1 } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo1 (__v4sf x, float f) +{ + __v4sf y = { f, x[0], x[2], x[3] }; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2b.c b/gcc/testsuite/gcc.target/i386/pr88828-2b.c new file mode 100644 index 00000000000..adfd7002a4d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-2b.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ +/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vmovaps" } } */ +/* { dg-final { scan-assembler-not "vmovlhps" } } */ +/* { dg-final { scan-assembler-not "vunpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo1 (__v4sf x, float f) +{ + __v4sf y = { f, x[0], x[2], x[3] }; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2c.c b/gcc/testsuite/gcc.target/i386/pr88828-2c.c new file mode 100644 index 00000000000..149967ea0b9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-2c.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-times "shufps" 1 } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +static __v4sf +vector_init (float f0,float f1, float f2,float f3) +{ + __v4sf y = { f0, f1, f2, f3 }; + return y; +} + +__attribute__((noinline, noclone)) +__v4sf +foo2 (__v4sf x, float f) +{ + return vector_init (f, x[0], x[2], x[3]) ; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2d.c b/gcc/testsuite/gcc.target/i386/pr88828-2d.c new file mode 100644 index 00000000000..21088496730 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-2d.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ +/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vmovaps" } } */ +/* { dg-final { scan-assembler-not "vmovlhps" } } */ +/* { dg-final { scan-assembler-not "vunpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +static __v4sf +vector_init (float f0,float f1, float f2,float f3) +{ + __v4sf y = { f0, f1, f2, f3 }; + return y; +} + +__attribute__((noinline, noclone)) +__v4sf +foo (__v4sf x, float f) +{ + return vector_init (f, x[0], x[2], x[3]) ; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3.c b/gcc/testsuite/gcc.target/i386/pr88828-3.c new file mode 100644 index 00000000000..adbc46dbf3b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-3.c @@ -0,0 +1,54 @@ +/* { dg-do run { target sse2_runtime } } */ +/* { dg-options "-O2 -msse2" } */ + +#include "pr88828-3a.c" +#include "pr88828-3b.c" +#include "pr88828-3c.c" + +extern void abort (); + +void +do_check (__v4sf y, float f[4], float z) +{ + int i; + + for (i = 0; i < 4; i++) + if (i == 3) + { + if (y[i] != z) + abort (); + } + else if (i == 0) + { + if (y[i] != f[i]) + abort (); + } + else + { + if (y[i] != f[i + 1]) + abort (); + } +} + +int +main (void) +{ + float f[4] = { -11, 2, 55553, -4 }; + float z = 134567; + __v4sf x = { f[0], f[1], f[2], f[3] }; + __v4sf y; + int i; + + for (i = 0; i < 4; i++) + if (x[i] != f[i]) + abort (); + + y = foo1 (x, z); + do_check (y, f, z); + y = foo2 (x, z); + do_check (y, f, z); + y = foo3 (x, z); + do_check (y, f, z); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3a.c b/gcc/testsuite/gcc.target/i386/pr88828-3a.c new file mode 100644 index 00000000000..e5cb95c1275 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-3a.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-times "shufps" 2 } } */ +/* { dg-final { scan-assembler-times "movaps" 1 } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo1 (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], f }; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3b.c b/gcc/testsuite/gcc.target/i386/pr88828-3b.c new file mode 100644 index 00000000000..0349f35b08a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-3b.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ +/* { dg-final { scan-assembler-times "vinsertps" 1 } } */ +/* { dg-final { scan-assembler-not "vmovss" } } */ +/* { dg-final { scan-assembler-not "vshufps" } } */ +/* { dg-final { scan-assembler-not "vmovaps" } } */ +/* { dg-final { scan-assembler-not "vmovlhps" } } */ +/* { dg-final { scan-assembler-not "vunpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo2 (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], f }; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3c.c b/gcc/testsuite/gcc.target/i386/pr88828-3c.c new file mode 100644 index 00000000000..fb668a55f1d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-3c.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ +/* { dg-final { scan-assembler-times "vinsertps" 1 } } */ +/* { dg-final { scan-assembler-not "vmovss" } } */ +/* { dg-final { scan-assembler-not "vshufps" } } */ +/* { dg-final { scan-assembler-not "vmovaps" } } */ +/* { dg-final { scan-assembler-not "vmovlhps" } } */ +/* { dg-final { scan-assembler-not "vunpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +static __v4sf +vector_init (float f0,float f1, float f2,float f3) +{ + __v4sf y = { f0, f1, f2, f3 }; + return y; +} + +__attribute__((noinline, noclone)) +__v4sf +foo3 (__v4sf x, float f) +{ + return vector_init (x[0], x[2], x[3], f); +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4a.c b/gcc/testsuite/gcc.target/i386/pr88828-4a.c new file mode 100644 index 00000000000..64043b9855f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-4a.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-times "shufps" 1 } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], x[1] }; + y[0] = f; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4b.c b/gcc/testsuite/gcc.target/i386/pr88828-4b.c new file mode 100644 index 00000000000..ad8d2b985d4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-4b.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ +/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vshufps" } } */ +/* { dg-final { scan-assembler-not "vmovaps" } } */ +/* { dg-final { scan-assembler-not "vmovlhps" } } */ +/* { dg-final { scan-assembler-not "vunpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], x[1] }; + y[0] = f; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5a.c b/gcc/testsuite/gcc.target/i386/pr88828-5a.c new file mode 100644 index 00000000000..5e908faef5c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-5a.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-times "shufps" 2 } } */ +/* { dg-final { scan-assembler-times "movaps" 1 } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], x[0] }; + y[3] = f; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5b.c b/gcc/testsuite/gcc.target/i386/pr88828-5b.c new file mode 100644 index 00000000000..988a48823e6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-5b.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ +/* { dg-final { scan-assembler-times "vinsertps" 1 } } */ +/* { dg-final { scan-assembler-not "vshufps" } } */ +/* { dg-final { scan-assembler-not "vmovss" } } */ +/* { dg-final { scan-assembler-not "vmovaps" } } */ +/* { dg-final { scan-assembler-not "vmovlhps" } } */ +/* { dg-final { scan-assembler-not "vunpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], x[0] }; + y[3] = f; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6.c b/gcc/testsuite/gcc.target/i386/pr88828-6.c new file mode 100644 index 00000000000..8d920396896 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-6.c @@ -0,0 +1,47 @@ +/* { dg-do run { target sse2_runtime } } */ +/* { dg-options "-O2 -msse2" } */ + +#include "pr88828-6a.c" +#include "pr88828-6b.c" + +extern void abort (); + +void +do_check (__v4sf x, float f1[4], float f2[4]) +{ + int i; + + for (i = 0; i < 4; i++) + if (i == 0) + { + if (x[i] != (f1[i] + f2[i])) + abort (); + } + else + { + if (x[i] != f1[i]) + abort (); + } +} + +int +main (void) +{ + float f1[4] = { -11, 2, 55553, -4 }; + float f2[4] = { 111, 3.3, -55.553, 4.8 }; + __v4sf x = { f1[0], f1[1], f1[2], f1[3] }; + __v4sf y = { f2[0], f2[1], f2[2], f2[3] }; + __v4sf z; + int i; + + for (i = 0; i < 4; i++) + if (x[i] != f1[i] || y[i] != f2[i] ) + abort (); + + z = foo1 (x, y); + do_check (z, f1, f2); + x = foo2 (x, y); + do_check (z, f1, f2); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6a.c b/gcc/testsuite/gcc.target/i386/pr88828-6a.c new file mode 100644 index 00000000000..4094f25a1fb --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-6a.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "addss" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpckhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__v4sf +foo1 (__v4sf x, __v4sf y) +{ + __v4sf z = { x[0] + y[0], x[1], x[2], x[3] }; + return z; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6b.c b/gcc/testsuite/gcc.target/i386/pr88828-6b.c new file mode 100644 index 00000000000..a423a089963 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-6b.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "addss" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpckhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +static __v4sf +vector_init (float f0,float f1, float f2,float f3) +{ + __v4sf y = { f0, f1, f2, f3 }; + return y; +} + +__v4sf +foo2 (__v4sf x, __v4sf y) +{ + return vector_init (x[0] + y[0], x[1], x[2], x[3]) ; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-7.c b/gcc/testsuite/gcc.target/i386/pr88828-7.c new file mode 100644 index 00000000000..471028d417d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-7.c @@ -0,0 +1,53 @@ +/* { dg-do run { target sse2_runtime } } */ +/* { dg-options "-O2 -msse2" } */ + +#include "pr88828-7a.c" +#include "pr88828-7b.c" + +extern void abort (); + +float +bar (float x, float y) +{ + return x / y - y * x; +} + +void +do_check (__v4sf x, float f1[4], float f2[4]) +{ + int i; + + for (i = 0; i < 4; i++) + if (i == 0) + { + if (x[i] != bar (f1[i], f2[i])) + abort (); + } + else + { + if (x[i] != f1[i]) + abort (); + } +} + +int +main (void) +{ + float f1[4] = { -11, 2, 55553, -4 }; + float f2[4] = { 111, 3.3, -55.553, 4.8 }; + __v4sf x = { f1[0], f1[1], f1[2], f1[3] }; + __v4sf y = { f2[0], f2[1], f2[2], f2[3] }; + __v4sf z; + int i; + + for (i = 0; i < 4; i++) + if (x[i] != f1[i] || y[i] != f2[i] ) + abort (); + + z = foo1 (x, y); + do_check (z, f1, f2); + x = foo2 (x, y); + do_check (z, f1, f2); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-7a.c b/gcc/testsuite/gcc.target/i386/pr88828-7a.c new file mode 100644 index 00000000000..f1ae57422d9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-7a.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpckhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); +extern float bar (float, float); + +__v4sf +foo1 (__v4sf x, __v4sf y) +{ + __v4sf z = { bar (x[0], y[0]), x[1], x[2], x[3] }; + return z; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-7b.c b/gcc/testsuite/gcc.target/i386/pr88828-7b.c new file mode 100644 index 00000000000..c027c56948d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-7b.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpckhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); +extern float bar (float, float); + +static __v4sf +vector_init (float f0,float f1, float f2,float f3) +{ + __v4sf y = { f0, f1, f2, f3 }; + return y; +} + +__v4sf +foo2 (__v4sf x, __v4sf y) +{ + return vector_init (bar (x[0], y[0]), x[1], x[2], x[3]) ; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-8.c b/gcc/testsuite/gcc.target/i386/pr88828-8.c new file mode 100644 index 00000000000..3b8eabd225f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-8.c @@ -0,0 +1,46 @@ +/* { dg-do run { target sse2_runtime } } */ +/* { dg-options "-O2 -msse2" } */ + +#include "pr88828-8a.c" +#include "pr88828-8b.c" + +extern void abort (); + +void +do_check (__v4sf y, float f[4], float z) +{ + int i; + + for (i = 0; i < 4; i++) + if (i == 0) + { + if (y[i] != z) + abort (); + } + else + { + if (y[i] != f[i]) + abort (); + } +} + +int +main (void) +{ + float f[4] = { -11, 2, 55553, -4 }; + float z = 11.4; + __v4sf x = { f[0], f[1], f[2], f[3] }; + __v4sf y; + int i; + + for (i = 0; i < 4; i++) + if (x[i] != f[i]) + abort (); + + y = foo1 (x); + do_check (y, f, z); + y = foo2 (x); + do_check (y, f, z); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-8a.c b/gcc/testsuite/gcc.target/i386/pr88828-8a.c new file mode 100644 index 00000000000..5d383dfd081 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-8a.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpckhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__v4sf +foo1 (__v4sf x) +{ + __v4sf z = { 11.4, x[1], x[2], x[3] }; + return z; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-8b.c b/gcc/testsuite/gcc.target/i386/pr88828-8b.c new file mode 100644 index 00000000000..5ffbc9c3103 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-8b.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpckhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +static __v4sf +vector_init (float f0,float f1, float f2,float f3) +{ + __v4sf y = { f0, f1, f2, f3 }; + return y; +} + +__v4sf +foo2 (__v4sf x) +{ + return vector_init (11.4, x[1], x[2], x[3]) ; +} diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c index eeb6281c652..f40eaffdcc1 100644 --- a/gcc/tree-ssa-forwprop.c +++ b/gcc/tree-ssa-forwprop.c @@ -2008,7 +2008,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) unsigned elem_size, i; unsigned HOST_WIDE_INT nelts; enum tree_code code, conv_code; - constructor_elt *elt; + constructor_elt *ce; bool maybe_ident; gcc_checking_assert (gimple_assign_rhs_code (stmt) == CONSTRUCTOR); @@ -2027,18 +2027,38 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) orig[1] = NULL; conv_code = ERROR_MARK; maybe_ident = true; - FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, elt) + + tree rhs_vector = NULL; + /* The single scalar element. */ + tree scalar_element = NULL; + unsigned int scalar_idx = 0; + bool insert = false; + unsigned int nscalars = 0; + unsigned int nvectors = 0; + FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, ce) { tree ref, op1; if (i >= nelts) return false; - if (TREE_CODE (elt->value) != SSA_NAME) - return false; - def_stmt = get_prop_source_stmt (elt->value, false, NULL); + if (TREE_CODE (ce->value) == SSA_NAME) + def_stmt = get_prop_source_stmt (ce->value, false, NULL); + else + def_stmt = NULL; if (!def_stmt) - return false; + { + /* Only allow one scalar insert. */ + if (nscalars != 0) + return false; + + nscalars = 1; + insert = true; + scalar_idx = i; + sel.quick_push (i); + scalar_element = ce->value; + continue; + } code = gimple_assign_rhs_code (def_stmt); if (code == FLOAT_EXPR || code == FIX_TRUNC_EXPR) @@ -2046,7 +2066,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) op1 = gimple_assign_rhs1 (def_stmt); if (conv_code == ERROR_MARK) { - if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (elt->value))), + if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (ce->value))), GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1))))) return false; conv_code = code; @@ -2061,7 +2081,18 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) code = gimple_assign_rhs_code (def_stmt); } if (code != BIT_FIELD_REF) - return false; + { + /* Only allow one scalar insert. */ + if (nscalars != 0) + return false; + + nscalars = 1; + insert = true; + scalar_idx = i; + sel.quick_push (i); + scalar_element = ce->value; + continue; + } op1 = gimple_assign_rhs1 (def_stmt); ref = TREE_OPERAND (op1, 0); unsigned int j; @@ -2095,11 +2126,29 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) elt += nelts; if (elt != i) maybe_ident = false; + + if (type == TREE_TYPE (ref)) + { + /* The RHS vector has the same type as LHS. */ + if (rhs_vector == NULL) + rhs_vector = ref; + /* Check if all RHS vector elements come fome the same + vector. */ + if (rhs_vector == ref) + nvectors++; + } + sel.quick_push (elt); } if (i < nelts) return false; + if (insert + && (nvectors == 0 + || (TYPE_VECTOR_SUBPARTS (type).to_constant () + != (nscalars + nvectors)))) + return false; + if (! VECTOR_TYPE_P (TREE_TYPE (orig[0])) || maybe_ne (TYPE_VECTOR_SUBPARTS (type), TYPE_VECTOR_SUBPARTS (TREE_TYPE (orig[0])))) @@ -2127,18 +2176,26 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) vec_perm_indices indices (sel, orig[1] ? 2 : 1, nelts); if (!can_vec_perm_const_p (TYPE_MODE (type), indices)) - return false; + { + if (insert) + gcc_unreachable (); + return false; + } mask_type = build_vector_type (build_nonstandard_integer_type (elem_size, 1), nelts); if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_VECTOR_INT || maybe_ne (GET_MODE_SIZE (TYPE_MODE (mask_type)), GET_MODE_SIZE (TYPE_MODE (type)))) - return false; + { + if (insert) + gcc_unreachable (); + return false; + } op2 = vec_perm_indices_to_tree (mask_type, indices); if (!orig[1]) orig[1] = orig[0]; - if (conv_code == ERROR_MARK) + if (conv_code == ERROR_MARK && !insert) gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0], orig[1], op2); else @@ -2148,10 +2205,25 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) VEC_PERM_EXPR, orig[0], orig[1], op2); orig[0] = gimple_assign_lhs (perm); gsi_insert_before (gsi, perm, GSI_SAME_STMT); - gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0], + gimple_assign_set_rhs_with_ops (gsi, + (conv_code != ERROR_MARK + ? conv_code + : NOP_EXPR), + orig[0], NULL_TREE, NULL_TREE); } } + if (insert) + { + /* Generate a single scalar insert. */ + tree var = make_ssa_name (type); + gimple *copy = gimple_copy (gsi_stmt (*gsi)); + gimple_assign_set_lhs (copy, var); + gsi_insert_before (gsi, copy, GSI_SAME_STMT); + tree bitpos = bitsize_int (scalar_idx * elem_size); + gimple_assign_set_rhs_with_ops (gsi, BIT_INSERT_EXPR, var, + scalar_element, bitpos); + } update_stmt (gsi_stmt (*gsi)); return true; } -- 2.20.1