The following makes it possible to vectorize complex multiplication without unrolling by allowing mixed operations in an SLP node (plus/minus only, for now). We vectorize such a group by computing both the plus and the minus variant and then merging the two results with a permute.
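For illustration, a minimal scalar kernel in the spirit of PR37021 (a hypothetical reduction, not taken from the PR; the name cmulf is mine) showing the kind of mixed plus/minus group the SLP build previously rejected:

void cmulf (float * __restrict__ r, const float * __restrict__ a,
	    const float * __restrict__ b)
{
  /* Complex multiply: the real part needs a minus, the imaginary part
     a plus, so the two-element store group mixes the two operators.  */
  r[0] = a[0] * b[0] - a[1] * b[1];
  r[1] = a[0] * b[1] + a[1] * b[0];
}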
Something needs to be done in the backends to fully leverage this: on
x86_64, for example, the addsub patterns match vec_merge, but the
permutes only use vec_merge for V4SF, not for V2DF (see the double
sketch after the patch).  We have too many ways to express this kind of
permutation (vec_perm, vec_merge and also vec_select (vec_concat ())).

This is not a full enablement for vectorizing the loop in PR37021;
some pieces are still missing (I'm working step-by-step here,
resurrecting old patches).

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.

Richard.

2015-05-12  Richard Biener  <rguent...@suse.de>

	PR tree-optimization/37021
	* tree-vectorizer.h (struct _slp_tree): Add two_operators flag.
	(SLP_TREE_TWO_OPERATORS): New define.
	* tree-vect-slp.c (vect_create_new_slp_node): Initialize
	SLP_TREE_TWO_OPERATORS.
	(vect_build_slp_tree_1): Allow mixing plus and minus in an
	SLP node.
	(vect_build_slp_tree): Adjust.
	(vect_analyze_slp_cost_1): Likewise.
	(vect_schedule_slp_instance): Vectorize mixing plus/minus by
	emitting two vector stmts and mixing the results.

	* gcc.target/i386/vect-addsub.c: New testcase.

Index: gcc/tree-vect-slp.c
===================================================================
*** gcc/tree-vect-slp.c.orig	2015-05-08 13:12:21.449383105 +0200
--- gcc/tree-vect-slp.c	2015-05-12 09:55:03.431066278 +0200
*************** vect_create_new_slp_node (vec<gimple> sc
*** 160,165 ****
--- 160,166 ----
    SLP_TREE_VEC_STMTS (node).create (0);
    SLP_TREE_CHILDREN (node).create (nops);
    SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
+   SLP_TREE_TWO_OPERATORS (node) = false;
  
    return node;
  }
*************** static bool
*** 472,482 ****
  vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
  		       vec<gimple> stmts, unsigned int group_size,
  		       unsigned nops, unsigned int *max_nunits,
! 		       unsigned int vectorization_factor, bool *matches)
  {
    unsigned int i;
!   gimple stmt = stmts[0];
!   enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
    enum tree_code first_cond_code = ERROR_MARK;
    tree lhs;
    bool need_same_oprnds = false;
--- 473,486 ----
  vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
  		       vec<gimple> stmts, unsigned int group_size,
  		       unsigned nops, unsigned int *max_nunits,
! 		       unsigned int vectorization_factor, bool *matches,
! 		       bool *two_operators)
  {
    unsigned int i;
!   gimple first_stmt = stmts[0], stmt = stmts[0];
!   enum tree_code first_stmt_code = ERROR_MARK;
!   enum tree_code alt_stmt_code = ERROR_MARK;
!   enum tree_code rhs_code = ERROR_MARK;
    enum tree_code first_cond_code = ERROR_MARK;
    tree lhs;
    bool need_same_oprnds = false;
*************** vect_build_slp_tree_1 (loop_vec_info loo
*** 662,671 ****
--- 666,685 ----
        else
  	{
  	  if (first_stmt_code != rhs_code
+ 	      && alt_stmt_code == ERROR_MARK)
+ 	    alt_stmt_code = rhs_code;
+ 	  if (first_stmt_code != rhs_code
  	      && (first_stmt_code != IMAGPART_EXPR
  		  || rhs_code != REALPART_EXPR)
  	      && (first_stmt_code != REALPART_EXPR
  		  || rhs_code != IMAGPART_EXPR)
+ 	      /* Handle mismatches in plus/minus by computing both
+ 		 and merging the results.  */
+ 	      && !((first_stmt_code == PLUS_EXPR
+ 		    || first_stmt_code == MINUS_EXPR)
+ 		   && (alt_stmt_code == PLUS_EXPR
+ 		       || alt_stmt_code == MINUS_EXPR)
+ 		   && rhs_code == alt_stmt_code)
  	      && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
  		   && (first_stmt_code == ARRAY_REF
  		       || first_stmt_code == BIT_FIELD_REF
*************** vect_build_slp_tree_1 (loop_vec_info loo
*** 679,685 ****
  			       "Build SLP failed: different operation "
  			       "in stmt ");
  	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
! 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  	    }
  	  /* Mismatch.  */
  	  continue;
--- 693,702 ----
  			       "Build SLP failed: different operation "
  			       "in stmt ");
  	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
! 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
! 			       "original stmt ");
! 	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
! 				first_stmt, 0);
  	    }
  	  /* Mismatch.  */
  	  continue;
*************** vect_build_slp_tree_1 (loop_vec_info loo
*** 916,921 ****
--- 933,975 ----
      if (!matches[i])
        return false;
  
+   /* If we allowed a two-operation SLP node verify the target can cope
+      with the permute we are going to use.  */
+   if (alt_stmt_code != ERROR_MARK
+       && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
+     {
+       unsigned char *sel
+ 	= XALLOCAVEC (unsigned char, TYPE_VECTOR_SUBPARTS (vectype));
+       for (i = 0; i < TYPE_VECTOR_SUBPARTS (vectype); ++i)
+ 	{
+ 	  sel[i] = i;
+ 	  if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
+ 	    sel[i] += TYPE_VECTOR_SUBPARTS (vectype);
+ 	}
+       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ 	{
+ 	  for (i = 0; i < group_size; ++i)
+ 	    if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
+ 	      {
+ 		matches[i] = false;
+ 		if (dump_enabled_p ())
+ 		  {
+ 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ 				     "Build SLP failed: different operation "
+ 				     "in stmt ");
+ 		    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+ 				      stmts[i], 0);
+ 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ 				     "original stmt ");
+ 		    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+ 				      first_stmt, 0);
+ 		  }
+ 	      }
+ 	  return false;
+ 	}
+       *two_operators = true;
+     }
+ 
    return true;
  }
*************** vect_build_slp_tree (loop_vec_info loop_
*** 952,961 ****
    else
      return false;
  
    if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
  			      SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
! 			      max_nunits, vectorization_factor, matches))
      return false;
  
    /* If the SLP node is a load, terminate the recursion.  */
    if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
--- 1006,1018 ----
    else
      return false;
  
+   bool two_operators = false;
    if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
  			      SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
! 			      max_nunits, vectorization_factor, matches,
! 			      &two_operators))
      return false;
+   SLP_TREE_TWO_OPERATORS (*node) = two_operators;
  
    /* If the SLP node is a load, terminate the recursion.  */
    if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
*************** vect_analyze_slp_cost_1 (loop_vec_info l
*** 1514,1521 ****
  	}
      }
    else
!     record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
! 		      stmt_info, 0, vect_body);
  
    /* Scan operands and account for prologue cost of constants/externals.
       ??? This over-estimates cost for multiple uses and should be
--- 1571,1587 ----
  	}
      }
    else
!     {
!       record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
! 			stmt_info, 0, vect_body);
!       if (SLP_TREE_TWO_OPERATORS (node))
! 	{
! 	  record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
! 			    stmt_info, 0, vect_body);
! 	  record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
! 			    stmt_info, 0, vect_body);
! 	}
!     }
  
    /* Scan operands and account for prologue cost of constants/externals.
       ??? This over-estimates cost for multiple uses and should be
*************** vect_schedule_slp_instance (slp_tree nod
*** 3347,3352 ****
--- 3413,3486 ----
        STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
      }
  
+   /* Handle two-operation SLP nodes by vectorizing the group with
+      both operations and then performing a merge.  */
+   if (SLP_TREE_TWO_OPERATORS (node))
+     {
+       enum tree_code code0 = gimple_assign_rhs_code (stmt);
+       enum tree_code ocode;
+       gimple ostmt;
+       unsigned char *mask = XALLOCAVEC (unsigned char, group_size);
+       bool allsame = true;
+       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
+ 	if (gimple_assign_rhs_code (ostmt) != code0)
+ 	  {
+ 	    mask[i] = 1;
+ 	    allsame = false;
+ 	    ocode = gimple_assign_rhs_code (ostmt);
+ 	  }
+ 	else
+ 	  mask[i] = 0;
+       if (!allsame)
+ 	{
+ 	  vec<gimple> v0;
+ 	  vec<gimple> v1;
+ 	  unsigned j;
+ 	  tree tmask = NULL_TREE;
+ 	  vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
+ 	  v0 = SLP_TREE_VEC_STMTS (node).copy ();
+ 	  SLP_TREE_VEC_STMTS (node).truncate (0);
+ 	  gimple_assign_set_rhs_code (stmt, ocode);
+ 	  vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
+ 	  gimple_assign_set_rhs_code (stmt, code0);
+ 	  v1 = SLP_TREE_VEC_STMTS (node).copy ();
+ 	  SLP_TREE_VEC_STMTS (node).truncate (0);
+ 	  tree meltype = build_nonstandard_integer_type
+ 	      (GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (vectype))), 1);
+ 	  tree mvectype = get_same_sized_vectype (meltype, vectype);
+ 	  unsigned k = 0, l;
+ 	  for (j = 0; j < v0.length (); ++j)
+ 	    {
+ 	      tree *melts = XALLOCAVEC (tree, TYPE_VECTOR_SUBPARTS (vectype));
+ 	      for (l = 0; l < TYPE_VECTOR_SUBPARTS (vectype); ++l)
+ 		{
+ 		  if (k > group_size)
+ 		    k = 0;
+ 		  melts[l] = build_int_cst
+ 		      (meltype, mask[k++] * TYPE_VECTOR_SUBPARTS (vectype) + l);
+ 		}
+ 	      tmask = build_vector (mvectype, melts);
+ 
+ 	      /* ??? Not all targets support a VEC_PERM_EXPR with a
+ 		 constant mask that would translate to a vec_merge RTX
+ 		 (with their vec_perm_const_ok).  We can either not
+ 		 vectorize in that case or let veclower do its job.
+ 		 Unfortunately that isn't too great and at least for
+ 		 plus/minus we'd eventually like to match targets'
+ 		 vector addsub instructions.  */
+ 	      gimple vstmt;
+ 	      vstmt = gimple_build_assign (make_ssa_name (vectype),
+ 					   VEC_PERM_EXPR,
+ 					   gimple_assign_lhs (v0[j]),
+ 					   gimple_assign_lhs (v1[j]), tmask);
+ 	      vect_finish_stmt_generation (stmt, vstmt, &si);
+ 	      SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
+ 	    }
+ 	  v0.release ();
+ 	  v1.release ();
+ 	  return false;
+ 	}
+     }
+ 
    is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
    return is_store;
  }
Index: gcc/testsuite/gcc.target/i386/vect-addsub.c
===================================================================
*** /dev/null	1970-01-01 00:00:00.000000000 +0000
--- gcc/testsuite/gcc.target/i386/vect-addsub.c	2015-05-11 13:56:07.958892513 +0200
***************
*** 0 ****
--- 1,22 ----
+ /* { dg-do compile } */
+ /* { dg-options "-O2 -ftree-vectorize -msse4 -mtune=generic" } */
+ 
+ /* We need SSE4 so the backend recognizes a { 0, 5, 2, 7 } constant
+    permutation as supported, as the vectorizer wants to generate
+ 
+    vect__6.10_24 = vect__3.6_20 - vect__5.9_23;
+    vect__6.11_25 = vect__3.6_20 + vect__5.9_23;
+    _26 = VEC_PERM_EXPR <vect__6.10_24, vect__6.11_25, { 0, 5, 2, 7 }>;
+ 
+    See also the ??? comment about using and/andn/or in expand_vec_perm_blend
+    for non-SSE4 targets.  */
+ 
+ void testf (float * __restrict__ p, float * __restrict q)
+ {
+   p[0] = p[0] - q[0];
+   p[1] = p[1] + q[1];
+   p[2] = p[2] - q[2];
+   p[3] = p[3] + q[3];
+ }
+ 
+ /* { dg-final { scan-assembler "addsubps" } } */
Index: gcc/tree-vectorizer.h
===================================================================
*** gcc/tree-vectorizer.h.orig	2015-04-28 09:59:32.259075685 +0200
--- gcc/tree-vectorizer.h	2015-05-11 13:29:04.889568537 +0200
*************** struct _slp_tree {
*** 111,116 ****
--- 111,118 ----
       scalar elements in one scalar iteration (GROUP_SIZE) multiplied by VF
       divided by vector size.  */
    unsigned int vec_stmts_size;
+   /* Whether the scalar computations use two different operators.  */
+   bool two_operators;
  };
*************** typedef struct _slp_instance {
*** 146,151 ****
--- 148,154 ----
  #define SLP_TREE_VEC_STMTS(S)              (S)->vec_stmts
  #define SLP_TREE_NUMBER_OF_VEC_STMTS(S)    (S)->vec_stmts_size
  #define SLP_TREE_LOAD_PERMUTATION(S)       (S)->load_permutation
+ #define SLP_TREE_TWO_OPERATORS(S)          (S)->two_operators
  
  /* This structure is used in creation of an SLP tree.  Each instance
     corresponds to the same operand in a group of scalar stmts in an SLP
     node.  */
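For completeness, the double sketch referred to above (mine, not part of the patch): given that the x86_64 permutes only use vec_merge for V4SF, the { 0, 3 } V2DF blend is not recognized yet, so this is not expected to produce addsubpd until the backend patterns are extended:

void testd (double * __restrict__ p, double * __restrict__ q)
{
  /* Same mixed minus/plus group; would need
     VEC_PERM_EXPR <v0, v1, { 0, 3 }> on V2DF to match addsubpd.  */
  p[0] = p[0] - q[0];
  p[1] = p[1] + q[1];
}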