[Bug tree-optimization/94269] New: widening_mul should consider block frequency

felix.yang at huawei dot com Mon, 23 Mar 2020 02:48:09 -0700

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94269


            Bug ID: 94269
           Summary: widening_mul should consider block frequency
           Product: gcc
           Version: 10.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: felix.yang at huawei dot com
  Target Milestone: ---

Test case:

/*
 * Strided dot product used as the reproducer for this report.
 *
 * Returns the sum of y[k * inc_y] * x[k * inc_x] for k = 0 .. n-1,
 * accumulated in a float.  Returns 0.0 when n <= 0 (explicitly for
 * n < 0; for n == 0 the loop body never runs).
 *
 *   n      number of element pairs to multiply and accumulate
 *   x, y   input arrays
 *   inc_x  stride, in elements, between consecutive reads of x
 *   inc_y  stride, in elements, between consecutive reads of y
 *
 * NOTE(review): i, ix and iy are int while n is long, so a very large n
 * would overflow them.  Left as-is on purpose: the statements below are
 * what produces the code generation shown in the report.
 */
float
calc(long n, float *x, int inc_x,
             float *y, int inc_y)
{
  float dot = 0.0;
  /* Current element indices into x and y. */
  int ix = 0, iy = 0;

  if (n < 0) {
    return dot;
  }

  int i = 0;
  while (i < n) {
    dot += y[iy] * x[ix];
    /* Advance both indices by their strides; after vectorization these
       loop-carried additions become the per-iteration pointer updates. */
    ix  += inc_x;
    iy  += inc_y;
    i++;
  }

  return dot;
}

Command line: aarch64-linux-gnu-gcc -S -O2 -fopt-info -ftree-loop-vectorize
-funsafe-math-optimizations -march=armv8.2-a+sve -msve-vector-bits=256 calc.c

// Compiler output with the widening_mul pass enabled.
// Per AAPCS64 the arguments of calc arrive as:
//   x0 = n, x1 = x, w2 = inc_x, x3 = y, w4 = inc_y; the float result is in s0.
calc:
.LFB0:
        .cfi_startproc
        // n <= 0: branch past the loop (.L4 is not part of this excerpt).
        cmp     x0, 0
        ble     .L4
        // w7 = low 32 bits of n (loop bound), x5 = y, x3 = x, x1 = element counter.
        mov     w7, w0
        mov     x5, x3
        // w6 = 32: multiplier used by both smaddl instructions inside the loop.
        mov     w6, 32
        mov     x3, x1
        mov     x1, 0
        // z4 = {0, inc_y, 2*inc_y, ...}, z3 = {0, inc_x, 2*inc_x, ...}:
        // per-lane element offsets for the gather loads.
        index   z4.s, #0, w4
        index   z3.s, #0, w2
        // p0 = lanes whose element index, counting from 0, is below w0.
        whilelo p0.s, wzr, w0
        // z0 = vector accumulator, cleared.
        mov     z0.s, #0
        .p2align 3,,7
.L3:
        // Gather 32-bit elements of y (base x5) and x (base x3); offsets are
        // sign-extended and scaled by 4, inactive lanes are zeroed.
        ld1w    z1.s, p0/z, [x5, z4.s, sxtw 2]
        ld1w    z2.s, p0/z, [x3, z3.s, sxtw 2]
        // Eight elements are consumed per iteration.
        add     x1, x1, 8
        // z0 += z1 * z2 in the active lanes.
        fmla    z0.s, p0/m, z1.s, z2.s
        // x5 += (int64) inc_y * 32: widening multiply-add executed every iteration.
        smaddl  x5, w4, w6, x5           <==============
        // Predicate for the next iteration: lanes with w1 + lane < w7.
        whilelo p0.s, w1, w7
        // x3 += (int64) inc_x * 32: widening multiply-add executed every iteration.
        smaddl  x3, w2, w6, x3           <==============
        // Loop again while any lane of p0 is still active.
        b.any   .L3
        // All-true predicate, then sum the lanes of z0 into s0 (the return value).
        ptrue   p0.b, vl32
        faddv   s0, p0, z0.s
        ret

Command line: aarch64-linux-gnu-gcc -S -O2 -fopt-info -ftree-loop-vectorize
-funsafe-math-optimizations -march=armv8.2-a+sve -msve-vector-bits=256 calc.c
-fdisable-tree-widening_mul

// Compiler output with the widening_mul pass disabled.
// Per AAPCS64 the arguments of calc arrive as:
//   x0 = n, x1 = x, w2 = inc_x, x3 = y, w4 = inc_y; the float result is in s0.
calc:
.LFB0:
        .cfi_startproc
        // n <= 0: branch past the loop (.L4 is not part of this excerpt).
        cmp     x0, 0
        ble     .L4
        // x8 = (int64) inc_y << 5 and x7 = (int64) inc_x << 5, i.e. stride * 32,
        // computed once here, outside the loop.
        sbfiz   x8, x4, 5, 32
        sbfiz   x7, x2, 5, 32
        // w6 = low 32 bits of n (loop bound), x5 = y, x3 = x, x1 = element counter.
        mov     w6, w0
        mov     x5, x3
        mov     x3, x1
        mov     x1, 0
        // z4 = {0, inc_y, 2*inc_y, ...}, z3 = {0, inc_x, 2*inc_x, ...}:
        // per-lane element offsets for the gather loads.
        index   z4.s, #0, w4
        index   z3.s, #0, w2
        // p0 = lanes whose element index, counting from 0, is below w0.
        whilelo p0.s, wzr, w0
        // z0 = vector accumulator, cleared.
        mov     z0.s, #0
        // p1 = all-true predicate used by the final reduction.
        ptrue   p1.b, vl32
        .p2align 3,,7
.L3:
        // Gather 32-bit elements of y (base x5) and x (base x3); offsets are
        // sign-extended and scaled by 4, inactive lanes are zeroed.
        ld1w    z1.s, p0/z, [x5, z4.s, sxtw 2]
        ld1w    z2.s, p0/z, [x3, z3.s, sxtw 2]
        // Eight elements are consumed per iteration.
        add     x1, x1, 8
        // z1 = z1 * z2 (separate multiply; the add into z0 follows below).
        fmul    z1.s, z1.s, z2.s
        // x5 += inc_y * 32, using the value precomputed in x8: a plain add.
        add     x5, x5, x8             <=============
        // z0 += z1 in the active lanes.
        fadd    z0.s, p0/m, z0.s, z1.s
        // x3 += inc_x * 32, using the value precomputed in x7: a plain add.
        add     x3, x3, x7             <=============
        // Predicate for the next iteration: lanes with w1 + lane < w6.
        whilelo p0.s, w1, w6
        // Loop again while any lane of p0 is still active.
        b.any   .L3
        // Sum the lanes of z0 into s0 (the return value).
        faddv   s0, p1, z0.s
        ret

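The widening_mul pass moves the two multiplications from outside the loop to
inside the loop, fusing each of them with one of the two add instructions to
form a widening multiply-add (the smaddl lines marked above).  This increases
the cost of the loop: every iteration now executes two multiply-adds instead
of two plain adds.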

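I think widening_mul should consider block frequency when doing such a
combination: it should not merge a multiplication into an add that lives in a
different basic block if that block executes more frequently than the block
containing the multiplication.
I mean something like: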
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index 54ba035..4439452 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -2721,7 +2721,10 @@ convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
     {
       if (!has_single_use (rhs1)
          || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
-                                 &type2, &mult_rhs2))
+                                 &type2, &mult_rhs2)
+         || (gimple_bb (rhs1_stmt) != gimple_bb (stmt)
+             && gimple_bb (rhs1_stmt)->count.to_frequency(cfun)
+                < gimple_bb (stmt)->count.to_frequency(cfun)))
        return false;
       add_rhs = rhs2;
       conv_stmt = conv1_stmt;
@@ -2730,7 +2733,10 @@ convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
     {
       if (!has_single_use (rhs2)
          || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
-                                 &type2, &mult_rhs2))
+                                 &type2, &mult_rhs2)
+         || (gimple_bb (rhs2_stmt) != gimple_bb (stmt)
+             && gimple_bb (rhs2_stmt)->count.to_frequency(cfun)
+                < gimple_bb (stmt)->count.to_frequency(cfun)))
        return false;
       add_rhs = rhs1;
       conv_stmt = conv2_stmt;

[Bug tree-optimization/94269] New: widening_mul should consider block frequency

Reply via email to