Hi, I have noticed a large performance regression in one of my numerical codes when switching from gcc 4.4 to gcc 4.5. A small test case is attached. When this test case is compiled with "gcc -O3 perf.c -lm -std=c99" and the resulting binary is executed, the CPU time is about 1.1s with the head of the 4.4 branch, whereas with the head of the trunk it is 2.1s.
This is on a Pentium D CPU. I have verified that both binaries produce identical results. If there is anything I can do to help locate the reason for this slowdown, I'd be glad to do so, but I must admit that I'm no good at interpreting assembly output. Any insight would be greatly appreciated. Thanks, Martin
#include <math.h> #include <stdlib.h> static inline double max (double a, double b) { return (a>=b) ? a : b; } static inline int nearest_int (double arg) { arg += 0.5; return (arg>=0) ? (int)arg : (int)arg-1; } void wrec3jj (double l2, double l3, double m2, double m3, double *res, int sz) { const int expo=250; const double srhuge=ldexp(1.,expo), tiny=ldexp(1.,-2*expo), srtiny=ldexp(1.,-expo); const double m1 = -m2 -m3; const double l1min = max(fabs(l2-l3),fabs(m1)), l1max = l2 + l3; const int ncoef = nearest_int(l1max-l1min)+1; const double l2ml3sq = (l2-l3)*(l2-l3), pre1 = (l2+l3+1.)*(l2+l3+1.), m1sq = m1*m1, pre2 = m1*(l2*(l2+1.)-l3*(l3+1.)), m3mm2 = m3-m2; int i=0; res[i] = srtiny; double sumfor = (2.*l1min+1.) * res[i]*res[i]; double c1=1e300; double oldfac=0.; do { if (i==ncoef-1) break; // all done ++i; const double l1 = l1min+i, l1sq = l1*l1; const double c1old=fabs(c1); const double newfac = sqrt((l1sq-l2ml3sq)*(pre1-l1sq)*(l1sq-m1sq)); if (i>1) { const double tmp1 = 1./((l1-1.)*newfac); c1 = (2.*l1-1.)*(pre2-(l1sq-l1)*m3mm2) * tmp1; res[i] = res[i-1]*c1 - res[i-2]*l1*oldfac*tmp1; } else { c1 = (l1>1.000001) ? (2.*l1-1.)*(pre2-(l1sq-l1)*m3mm2)/((l1-1.)*newfac) : (2.*l1-1.)*l1*(m3mm2)/newfac; res[i] = res[i-1]*c1; } oldfac=newfac; if (c1old<=fabs(c1)) break; } while (1); } int main(void) { double *res = (double *)malloc(1000*sizeof(double)); for (int m=0; m<1000000; ++m) wrec3jj (100, 60, 60, -50, res, 1000); return 0; }