https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69720
Richard Biener <rguenth at gcc dot gnu.org> changed:
           What    |Removed                     |Added
----------------------------------------------------------------------------
           Priority|P1                          |P2
      Known to work|                            |4.6.4
   Target Milestone|6.0                         |4.9.4
            Summary|[6 Regression] wrong code   |[4.9/5/6 Regression] wrong
                   |at -O3 on x86_64-linux-gnu  |code at -O3 on
                   |                            |x86_64-linux-gnu
      Known to fail|                            |4.7.3, 4.8.5, 4.9.3, 5.3.0,
                   |                            |6.0
--- Comment #4 from Richard Biener <rguenth at gcc dot gnu.org> ---
OK, so it looks like an inner-loop reduction is not handled correctly during
outer-loop vectorization when ncopies > 1, i.e. when more than one vector
statement is needed per scalar statement. Test case:
/* abort is declared by hand; this test case includes no headers.  */
extern void abort (void);
/* a is only written by foo (a[i] = i); nothing in this test case reads it.  */
int a[128];
/* Only the first four elements are given explicit values; the remaining
   elements of b are zero-initialized.  */
double b[128] = { 1., 2., 3., 4. };
/* Reproducer kernel: for each of the 128 elements, load b[i], add 1 to it
   32 times in the inner j loop, store the result back to b[i], and set
   a[i] = i.  The noinline attribute prevents foo from being inlined into
   its caller.
   NOTE(review): presumably the int store to a[] next to the double
   reduction is what makes the vectorizer need two vector copies
   (ncopies > 1) for the double statements -- confirm against the dump.  */
void __attribute__((noinline)) foo()
{
int i;
for (i = 0; i < 128; ++i)
{
double tem1 = b[i];
/* Inner-loop reduction: tem1 starts at b[i] and accumulates 32 increments.  */
for (int j = 0; j < 32; ++j)
tem1 += 1;
b[i] = tem1;
a[i] = i;
}
}
/* Runs foo, then checks the first four elements of b.  Each must equal its
   initial value (1., 2., 3., 4.) plus the 32 increments applied by foo,
   i.e. 33., 34., 35., 36.  Calls abort on any mismatch, otherwise
   returns 0.  */
int main()
{
foo ();
if (b[0] != 33. || b[1] != 34.
|| b[2] != 35. || b[3] != 36.)
abort ();
return 0;
}
This is vectorized to
<bb 4>:
# tem1_20 = PHI <tem1_7(5), tem1_6(3)>
# j_21 = PHI <j_8(5), 0(3)>
# ivtmp_12 = PHI <ivtmp_1(5), 32(3)>
# vect_tem1_7.9_26 = PHI <vect_tem1_7.9_28(5), { 0.0, 0.0 }(3)>
# vect_tem1_7.9_29 = PHI <vect_tem1_7.9_30(5), { 0.0, 0.0 }(3)>
vect_tem1_7.9_28 = vect_tem1_7.9_26 + vect_cst__27;
vect_tem1_7.9_30 = vect_tem1_7.9_29 + vect_cst__27;
tem1_7 = tem1_20 + 1.0e+0;
j_8 = j_21 + 1;
ivtmp_1 = ivtmp_12 - 1;
if (ivtmp_1 != 0)
goto <bb 5>;
else
goto <bb 6>;
<bb 5>:
goto <bb 4>;
<bb 6>:
# tem1_16 = PHI <tem1_7(4)>
# vect_tem1_7.9_31 = PHI <vect_tem1_7.9_28(4)>
# vect_tem1_7.9_32 = PHI <vect_tem1_7.9_30(4)>
vect_tem1_7.11_33 = vect_tem1_7.9_31 + vect_tem1_6.7_23;
MEM[(double *)vectp_b.12_34] = vect_tem1_7.11_33;
vectp_b.12_37 = vectp_b.12_34 + 16;
MEM[(double *)vectp_b.12_37] = vect_tem1_7.9_32;
Note how we miss a second adjustment statement, analogous to
vect_tem1_7.11_33 = vect_tem1_7.9_31 + vect_tem1_6.7_23;
for the second inner reduction PHI: the second store writes vect_tem1_7.9_32
without that adjustment. Already broken in GCC 4.8 and 4.7.