https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113594
Bug ID: 113594 Summary: Missing partial sum optimization in the vectorizer. Product: gcc Version: 14.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: tree-optimization Assignee: unassigned at gcc dot gnu.org Reporter: liuhongt at gcc dot gnu.org Target Milestone: --- double foo (short* p, int n) { double sum = 0; for (int i = 0; i != n; i++) sum += p[i] * (double)p[i]; return sum; } With fast-math, the vectorizer generates: <bb 5> [local count: 860067200]: # vect_sum_16.8_44 = PHI <vect_sum_12.15_61(5), { 0.0, 0.0, 0.0, 0.0 }(4)> # ivtmp.35_152 = PHI <ivtmp.35_151(5), ivtmp.35_150(4)> # DEBUG BEGIN_STMT # DEBUG D#13 => D#14 * 2 # DEBUG D#12 => p_11(D) + D#13 _149 = (void *) ivtmp.35_152; vect__4.11_47 = MEM <vector(16) short int> [(short int *)_149]; # DEBUG D#11 => *D#12 vect__5.13_48 = [vec_unpack_lo_expr] vect__4.11_47; vect__5.13_49 = [vec_unpack_hi_expr] vect__4.11_47; vect__5.12_50 = [vec_unpack_float_lo_expr] vect__5.13_48; vect__5.12_51 = [vec_unpack_float_hi_expr] vect__5.13_48; vect__5.12_52 = [vec_unpack_float_lo_expr] vect__5.13_49; vect__5.12_53 = [vec_unpack_float_hi_expr] vect__5.13_49; # DEBUG D#10 => (double) D#11 vect_powmult_6.14_55 = vect__5.12_51 * vect__5.12_51; _62 = .FMA (vect__5.12_50, vect__5.12_50, vect_powmult_6.14_55); vect_powmult_6.14_57 = vect__5.12_53 * vect__5.12_53; _45 = .FMA (vect__5.12_52, vect__5.12_52, vect_powmult_6.14_57); _46 = _45 + _62; # DEBUG D#9 => D#10 * D#10 vect_sum_12.15_61 = vect_sum_16.8_44 + _46; # DEBUG sum => D#8 # DEBUG BEGIN_STMT # DEBUG i => NULL # DEBUG sum => D#8 # DEBUG BEGIN_STMT ivtmp.35_151 = ivtmp.35_152 + 32; if (_18 != ivtmp.35_151) goto <bb 5>; [89.00%] else goto <bb 8>; [11.00%] But it could be better with: .... 
vect_powmult_6.14_55 = .FMA (vect__5.12_51, vect__5.12_51, 0); _62 = .FMA (vect__5.12_50, vect__5.12_50, 0); vect_powmult_6.14_57 = .FMA (vect__5.12_53, vect__5.12_53, 0); _45 = .FMA (vect__5.12_52, vect__5.12_52, 0); ivtmp.35_151 = ivtmp.35_152 + 32; if (_18 != ivtmp.35_151) goto <bb 5>; [89.00%] else goto <bb 8>; [11.00%] <bb 8> _tmp1 = vect_powmult_6.14_55 + _62; _tmp2 = vect_powmult_6.14_57 + _45; _tmp3 = _tmp1 + _tmp2; _tmp4_scalar = .REDUCE_SUM (_tmp3);