On Tue, 14 Jan 2014, Richard Biener wrote:

> On Mon, 13 Jan 2014, Cong Hou wrote:
> 
> > I noticed that LIM could not hoist vector invariants, and that is why
> > my first implementation tries to hoist them all.
> 
> Yes, I filed PR59786 for this.  I'll see if I can come up with
> a fix suitable for stage3.
> 
> > In addition, there are two disadvantages of hoisting invariant load +
> > lim method:
> > 
> > First, for some instructions the scalar version is faster than the
> > vector version, and in this case hoisting scalar instructions before
> > vectorization is better. Those instructions include data
> > packing/unpacking, integer multiplication with SSE2, etc.
> > 
> > Second, it may use more SIMD registers.
> > 
> > The following code shows a simple example:
> > 
> > char *a, *b, *c;
> > for (int i = 0; i < N; ++i)
> >   a[i] = b[0] * c[0] + a[i];
> > 
> > Vectorizing b[0]*c[0] is worse than loading the result of b[0]*c[0]
> > into a vector.
> 
> Yes.  I've tried adjusting the vec_def_type as in the prototype
> patch I sent before Christmas, but that's quite intrusive for this
> stage.  You could argue that performing invariant motion is not
> really the vectorizer's main task, and that a combination of hoisting
> only the load, LIM later hoisting the rest, and tree-vect-generic.c
> then demoting vector ops to scalar ops (unimplemented, but also a useful
> general optimization) would work as well.

For example with the following untested patch.  Not sure if the LIM change
is appropriate at this stage (its handling of "cost" is weird, and
in other places of the compiler we simply hoist invariants
aggressively and expect RTL to fix up register pressure issues).
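
To make the scenario concrete, here is a rough C model of the loop
the vectorizer produces for test1 below, written with GCC's generic
vectors (a sketch with made-up names, not actual vectorizer output):
the splat of *b ends up before the loop, but the invariant vector
addition stays inside unless LIM is willing to move it.

typedef int v4si __attribute__ ((vector_size (16)));

void test1_model (int *a, int *b)
{
  /* Splat of the invariant load, already placed before the loop.  */
  v4si splat = { *b, *b, *b, *b };
  int i;
  for (i = 0; i < 100000; i += 4)
    {
      /* Loop-invariant vector addition; the stmt_cost change below is
         meant to make tree LIM move this out of the loop as well.  */
      v4si tmp = splat + (v4si) { 1, 1, 1, 1 };
      /* Aligned vector store, as the real vectorized loop would emit.  */
      *(v4si *) (a + i) = tmp;
    }
}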

The lowering change looks more like something for forwprop, but that
runs quite late, after vectorization.  tree-vect-generic could
at least factor in whether the target has a scalar op of that
kind and whether that is perhaps more expensive (though trading
two vector splats for one very likely offsets that).  It
would also need to consider the case where this moves a vector
splat inside a loop when handling the testcases we are talking
about without improved invariant motion.
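
As an illustration of what the lowering does, here is a before/after
sketch in plain C with GCC's generic vectors (the real transform works
on GIMPLE statements; the function names are made up):

typedef int v4si __attribute__ ((vector_size (16)));

/* Before: two splats feed a vector multiply.  */
v4si mult_before (int b0, int c0)
{
  v4si vb = { b0, b0, b0, b0 };
  v4si vc = { c0, c0, c0, c0 };
  return vb * vc;
}

/* After: the multiply is done on the scalar elements and only the
   result is splat, trading two vector splats for one.  */
v4si mult_after (int b0, int c0)
{
  int t = b0 * c0;
  return (v4si) { t, t, t, t };
}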

Any comments?  Anything we want to fix before 4.9?  The
testcases are optimized by RTL invariant motion, but they
still perform a vector addition.  For example

void test1 (int* a, int* b)
{
  int i;
  for (i = 0; i < 100000; ++i)
    a[i] = *b + 1;
}

gets

.L7:
        movd    (%rsi), %xmm1
        leaq    (%rdi,%rdx,4), %rdx
        xorl    %eax, %eax
        pshufd  $0, %xmm1, %xmm0
        paddd   .LC0(%rip), %xmm0
        .p2align 4,,10
        .p2align 3
.L4:
        addl    $1, %eax
        addq    $16, %rdx
        movaps  %xmm0, -16(%rdx)
        cmpl    %eax, %ecx
        ja      .L4

instead of

.L7:
        movl    (%rsi), %eax
        leaq    (%rdi,%rdx,4), %rdx
        addl    $1, %eax
        movl    %eax, -12(%rsp)
        xorl    %eax, %eax
        movd    -12(%rsp), %xmm1
        pshufd  $0, %xmm1, %xmm0
        .p2align 4,,10
        .p2align 3
.L4:
        addl    $1, %eax
        addq    $16, %rdx
        movaps  %xmm0, -16(%rdx)
        cmpl    %eax, %ecx
        ja      .L4

which, because inter-unit moves are disabled by default, looks
even more expensive.  With inter-unit moves enabled we get

.L7:
        movl    (%rsi), %eax
        leaq    (%rdi,%rdx,4), %rdx
        addl    $1, %eax
        movd    %eax, %xmm0
        xorl    %eax, %eax
        pshufd  $0, %xmm0, %xmm0
        .p2align 4,,10
        .p2align 3
.L4:
        addl    $1, %eax
        movaps  %xmm0, (%rdx)
        addq    $16, %rdx
        cmpl    %eax, %ecx
        ja      .L4

Not sure if the avoided constant pool load offsets the inter-unit
move here (that depends on the pipeline constraints involved;
the above is with corei7 tuning).

It looks to me that demoting vector ops to scalar ops might be
better performed at the RTL level?  Plus the reverse transformation,
as seen in the above example, where it isn't at all clear which
variant is better (that probably depends quite a bit
on the CPU architecture).

Thanks,
Richard.

Index: gcc/tree-ssa-loop-im.c
===================================================================
*** gcc/tree-ssa-loop-im.c      (revision 206599)
--- gcc/tree-ssa-loop-im.c      (working copy)
*************** stmt_cost (gimple stmt)
*** 533,538 ****
--- 533,541 ----
        return 0;
  
      default:
+       /* All vector operations are expensive.  */
+       if (VECTOR_TYPE_P (gimple_expr_type (stmt)))
+         return LIM_EXPENSIVE;
        return 1;
      }
  }
Index: gcc/tree-vect-generic.c
===================================================================
*** gcc/tree-vect-generic.c     (revision 206599)
--- gcc/tree-vect-generic.c     (working copy)
*************** lower_vec_perm (gimple_stmt_iterator *gs
*** 1335,1340 ****
--- 1335,1357 ----
    update_stmt (gsi_stmt (*gsi));
  }
  
+ /* If OP is a uniform vector, return the element it is a splat of.  */
+ 
+ static tree
+ ssa_uniform_vector_p (tree op)
+ {
+   if (TREE_CODE (op) == VECTOR_CST
+       || TREE_CODE (op) == CONSTRUCTOR)
+     return uniform_vector_p (op);
+   if (TREE_CODE (op) == SSA_NAME)
+     {
+       gimple def_stmt = SSA_NAME_DEF_STMT (op);
+       if (gimple_assign_single_p (def_stmt))
+         return uniform_vector_p (gimple_assign_rhs1 (def_stmt));
+     }
+   return NULL_TREE;
+ }
+ 
  /* Process one statement.  If we identify a vector operation, expand it.  */
  
  static void
*************** expand_vector_operations_1 (gimple_stmt_
*** 1388,1393 ****
--- 1405,1430 ----
    if (TREE_CODE (type) != VECTOR_TYPE)
      return;
  
+   if (code == NEGATE_EXPR
+       || code == PLUS_EXPR
+       || code == MINUS_EXPR
+       || code == MULT_EXPR)
+     {
+       tree srhs1, srhs2 = NULL_TREE;
+       if ((srhs1 = ssa_uniform_vector_p (rhs1)) != NULL_TREE
+           && (rhs2 == NULL_TREE
+               || (srhs2 = ssa_uniform_vector_p (rhs2)) != NULL_TREE))
+         {
+           tree slhs = make_ssa_name (TREE_TYPE (srhs1), NULL);
+           gimple repl = gimple_build_assign_with_ops (code, slhs, srhs1, srhs2);
+           gsi_insert_before (gsi, repl, GSI_SAME_STMT);
+           gimple_assign_set_rhs_from_tree (gsi,
+                                            build_vector_from_val (type, slhs));
+           update_stmt (stmt);
+           return;
+         }
+     }
+ 
    if (code == NOP_EXPR
        || code == FLOAT_EXPR
        || code == FIX_TRUNC_EXPR
*************** expand_vector_operations_1 (gimple_stmt_
*** 1433,1447 ****
        if (VECTOR_INTEGER_TYPE_P (TREE_TYPE (rhs2)))
          {
            tree first;
!           gimple def_stmt;
! 
!           if ((TREE_CODE (rhs2) == VECTOR_CST
!              && (first = uniform_vector_p (rhs2)) != NULL_TREE)
!             || (TREE_CODE (rhs2) == SSA_NAME
!                 && (def_stmt = SSA_NAME_DEF_STMT (rhs2))
!                 && gimple_assign_single_p (def_stmt)
!                 && (first = uniform_vector_p
!                     (gimple_assign_rhs1 (def_stmt))) != NULL_TREE))
              {
                gimple_assign_set_rhs2 (stmt, first);
                update_stmt (stmt);
--- 1470,1476 ----
        if (VECTOR_INTEGER_TYPE_P (TREE_TYPE (rhs2)))
          {
            tree first;
!           if ((first = ssa_uniform_vector_p (rhs2)) != NULL_TREE)
              {
                gimple_assign_set_rhs2 (stmt, first);
                update_stmt (stmt);
