The following makes sure to use a VEC_PERM SLP node to produce
lane duplications for non-grouped SLP loads as those are later
not lowered by load permutation lowering.

For some reason gcc.dg/vect/pr106081.c now fails permute optimization;
in particular, the vector reversal for the reduction is no longer elided.

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

        * tree-vect-slp.cc (vect_build_slp_tree_2): Use a VEC_PERM
        SLP node to duplicate lanes for non-grouped loads.

        * gcc.dg/vect/pr106081.c: Adjust.
---
 gcc/testsuite/gcc.dg/vect/pr106081.c |  2 +-
 gcc/tree-vect-slp.cc                 | 38 +++++++++++++++++++++++++++-
 2 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr106081.c b/gcc/testsuite/gcc.dg/vect/pr106081.c
index 8f97af2d642..1864320c803 100644
--- a/gcc/testsuite/gcc.dg/vect/pr106081.c
+++ b/gcc/testsuite/gcc.dg/vect/pr106081.c
@@ -30,4 +30,4 @@ test(double *k)
 }
 
 /* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */
-/* { dg-final { scan-tree-dump-times "VEC_PERM" 4 "optimized" { target x86_64-*-* i?86-*-* } } } */
+/* { dg-final { scan-tree-dump-times "VEC_PERM" 5 "optimized" { target x86_64-*-* i?86-*-* } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index af00c5e35dd..b34064103bd 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -2088,7 +2088,43 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
            }
          else
            {
-             SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
+             if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
+               {
+                 /* Do not use SLP_TREE_LOAD_PERMUTATION for non-grouped
+                    accesses.  Instead, when lanes are duplicated, do so
+                    via a VEC_PERM node.  */
+                 if (!any_permute)
+                   load_permutation.release ();
+                 else
+                   {
+                     gcc_assert (group_size != 1);
+                     vec<stmt_vec_info> stmts2;
+                     stmts2.create (1);
+                     stmts2.quick_push (stmt_info);
+                     bool matches2;
+                     slp_tree unperm_load
+                       = vect_build_slp_tree (vinfo, stmts2, 1,
+                                              &this_max_nunits, &matches2,
+                                              limit, &this_tree_size, bst_map);
+                     gcc_assert (unperm_load);
+                     lane_permutation_t lperm;
+                     lperm.create (group_size);
+                     for (unsigned j = 0; j < load_permutation.length (); ++j)
+                       {
+                         gcc_assert (load_permutation[j] == 0);
+                         lperm.quick_push (std::make_pair (0, 0));
+                       }
+                     SLP_TREE_CODE (node) = VEC_PERM_EXPR;
+                     SLP_TREE_CHILDREN (node).safe_push (unperm_load);
+                     SLP_TREE_LANE_PERMUTATION (node) = lperm;
+                     load_permutation.release ();
+                     *max_nunits = this_max_nunits;
+                     (*tree_size)++;
+                     return node;
+                   }
+               }
+             else
+               SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
              return node;
            }
        }
-- 
2.43.0

Reply via email to