The following makes sure to use a VEC_PERM SLP node to produce lane duplications for non-grouped SLP loads as those are later not lowered by load permutation lowering.
For some reason gcc.dg/vect/pr106081.c now fails permute optimization; in particular, the vector reversal for the reduction is no longer elided. Bootstrapped on x86_64-unknown-linux-gnu, testing in progress. * tree-vect-slp.cc (vect_build_slp_tree_2): Use a VEC_PERM SLP node to duplicate lanes for non-grouped loads. * gcc.dg/vect/pr106081.c: Adjust. --- gcc/testsuite/gcc.dg/vect/pr106081.c | 2 +- gcc/tree-vect-slp.cc | 38 +++++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/pr106081.c b/gcc/testsuite/gcc.dg/vect/pr106081.c index 8f97af2d642..1864320c803 100644 --- a/gcc/testsuite/gcc.dg/vect/pr106081.c +++ b/gcc/testsuite/gcc.dg/vect/pr106081.c @@ -30,4 +30,4 @@ test(double *k) } /* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */ -/* { dg-final { scan-tree-dump-times "VEC_PERM" 4 "optimized" { target x86_64-*-* i?86-*-* } } } */ +/* { dg-final { scan-tree-dump-times "VEC_PERM" 5 "optimized" { target x86_64-*-* i?86-*-* } } } */ diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index af00c5e35dd..b34064103bd 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -2088,7 +2088,43 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, } else { - SLP_TREE_LOAD_PERMUTATION (node) = load_permutation; + if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) + { + /* Do not use SLP_TREE_LOAD_PERMUTATION for non-grouped + accesses. Instead, when lanes are duplicated, do so via a + VEC_PERM node. 
*/ + if (!any_permute) + load_permutation.release (); + else + { + gcc_assert (group_size != 1); + vec<stmt_vec_info> stmts2; + stmts2.create (1); + stmts2.quick_push (stmt_info); + bool matches2; + slp_tree unperm_load + = vect_build_slp_tree (vinfo, stmts2, 1, + &this_max_nunits, &matches2, + limit, &this_tree_size, bst_map); + gcc_assert (unperm_load); + lane_permutation_t lperm; + lperm.create (group_size); + for (unsigned j = 0; j < load_permutation.length (); ++j) + { + gcc_assert (load_permutation[j] == 0); + lperm.quick_push (std::make_pair (0, 0)); + } + SLP_TREE_CODE (node) = VEC_PERM_EXPR; + SLP_TREE_CHILDREN (node).safe_push (unperm_load); + SLP_TREE_LANE_PERMUTATION (node) = lperm; + load_permutation.release (); + *max_nunits = this_max_nunits; + (*tree_size)++; + return node; + } + } + else + SLP_TREE_LOAD_PERMUTATION (node) = load_permutation; return node; } } -- 2.43.0