https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94269
Bug ID: 94269 Summary: widening_mul should consider block frequency Product: gcc Version: 10.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: tree-optimization Assignee: unassigned at gcc dot gnu.org Reporter: felix.yang at huawei dot com Target Milestone: --- Test case: float calc(long n, float *x, int inc_x, float *y, int inc_y) { float dot = 0.0; int ix = 0, iy = 0; if (n < 0) { return dot; } int i = 0; while (i < n) { dot += y[iy] * x[ix]; ix += inc_x; iy += inc_y; i++; } return dot; } Command line: aarch64-linux-gnu-gcc -S -O2 -fopt-info -ftree-loop-vectorize -funsafe-math-optimizations -march=armv8.2-a+sve -msve-vector-bits=256 calc.c calc: .LFB0: .cfi_startproc cmp x0, 0 ble .L4 mov w7, w0 mov x5, x3 mov w6, 32 mov x3, x1 mov x1, 0 index z4.s, #0, w4 index z3.s, #0, w2 whilelo p0.s, wzr, w0 mov z0.s, #0 .p2align 3,,7 .L3: ld1w z1.s, p0/z, [x5, z4.s, sxtw 2] ld1w z2.s, p0/z, [x3, z3.s, sxtw 2] add x1, x1, 8 fmla z0.s, p0/m, z1.s, z2.s smaddl x5, w4, w6, x5 <============== whilelo p0.s, w1, w7 smaddl x3, w2, w6, x3 <============== b.any .L3 ptrue p0.b, vl32 faddv s0, p0, z0.s ret Command line: aarch64-linux-gnu-gcc -S -O2 -fopt-info -ftree-loop-vectorize -funsafe-math-optimizations -march=armv8.2-a+sve -msve-vector-bits=256 calc.c -fdisable-tree-widening_mul calc: .LFB0: .cfi_startproc cmp x0, 0 ble .L4 sbfiz x8, x4, 5, 32 sbfiz x7, x2, 5, 32 mov w6, w0 mov x5, x3 mov x3, x1 mov x1, 0 index z4.s, #0, w4 index z3.s, #0, w2 whilelo p0.s, wzr, w0 mov z0.s, #0 ptrue p1.b, vl32 .p2align 3,,7 .L3: ld1w z1.s, p0/z, [x5, z4.s, sxtw 2] ld1w z2.s, p0/z, [x3, z3.s, sxtw 2] add x1, x1, 8 fmul z1.s, z1.s, z2.s add x5, x5, x8 <============= fadd z0.s, p0/m, z0.s, z1.s add x3, x3, x7 <============= whilelo p0.s, w1, w6 b.any .L3 faddv s0, p1, z0.s ret widening_mul phase moves the two multiply instructions from outside the loop to inside the loop, merging with the two add instructions separately. This increases the cost of the loop. I think widening_mul should consider block frequency when doing such a combination. I mean something like: diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c index 54ba035..4439452 100644 --- a/gcc/tree-ssa-math-opts.c +++ b/gcc/tree-ssa-math-opts.c @@ -2721,7 +2721,10 @@ convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt, { if (!has_single_use (rhs1) || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1, - &type2, &mult_rhs2)) + &type2, &mult_rhs2) + || (gimple_bb (rhs1_stmt) != gimple_bb (stmt) + && gimple_bb (rhs1_stmt)->count.to_frequency(cfun) + < gimple_bb (stmt)->count.to_frequency(cfun))) return false; add_rhs = rhs2; conv_stmt = conv1_stmt; @@ -2730,7 +2733,10 @@ convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt, { if (!has_single_use (rhs2) || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1, - &type2, &mult_rhs2)) + &type2, &mult_rhs2) + || (gimple_bb (rhs2_stmt) != gimple_bb (stmt) + && gimple_bb (rhs2_stmt)->count.to_frequency(cfun) + < gimple_bb (stmt)->count.to_frequency(cfun))) return false; add_rhs = rhs1; conv_stmt = conv2_stmt;