Hi,

I have noticed a big performance decrease in one of my numerical codes
when switching from gcc 4.4 to gcc 4.5. A small test case is attached.
When compiling this test case with "gcc -O3 perf.c -lm -std=c99"
and executing the resulting binary, the CPU time is about 1.1s with the
head of the 4.4 branch, but about 2.1s with the head of the trunk.

This is on a Pentium D CPU. I have verified that both binaries produce
identical results.

If I can do anything to help locate the reason for this slowdown, I'd be
glad to do so, but I must admit that I'm no good at interpreting assembly code.

Any insight would be greatly appreciated.

Thanks,
  Martin
#include <math.h>
#include <stdlib.h>

/* Returns a when a>=b holds and b otherwise; because the comparison is false
   for NaN operands, b is the result whenever either argument is NaN. */
static inline double max (double a, double b)
  {
  if (a>=b)
    return a;
  return b;
  }

/* Rounds arg to an int: 0.5 is added and the sum is truncated towards zero;
   if the sum is negative, one is subtracted from the truncated value. */
static inline int nearest_int (double arg)
  {
  const double shifted = arg + 0.5;
  const int truncated = (int)shifted;

  if (shifted>=0)
    return truncated;
  return truncated-1;
  }

/*
 * Fills res[] with the terms of a three-term recurrence in l1, started from a
 * tiny seed value.  NOTE(review): going by the name this is presumably the
 * forward recursion for Wigner 3j symbols -- confirm against the full source.
 *
 * l2, l3, m2, m3  parameters of the recurrence; m1 is derived as -m2-m3.
 * res             output buffer; res[0] receives the seed 2^-250 and res[i]
 *                 the term for l1 = l1min+i.
 * sz              number of doubles available in res.
 *
 * The recursion stops at the first of: index ncoef-1 reached, index sz-1
 * reached (so res is never overrun), or the first step at which |c1| is not
 * smaller than on the previous step.  Nothing is written when res is null or
 * sz < 1; only res[0] is written when ncoef < 1 (i.e. l1max < l1min).
 */
void wrec3jj (double l2, double l3, double m2, double m3, double *res, int sz)
  {
  if (!res || sz<1) return; // no room for even the seed value

  const int expo=250;
  const double srtiny=ldexp(1.,-expo);

  const double m1 = -m2 -m3;
  const double l1min = max(fabs(l2-l3),fabs(m1)),
               l1max = l2 + l3;
  const int ncoef = nearest_int(l1max-l1min)+1;
  const double l2ml3sq = (l2-l3)*(l2-l3),
               pre1 = (l2+l3+1.)*(l2+l3+1.),
               m1sq = m1*m1,
               pre2 = m1*(l2*(l2+1.)-l3*(l3+1.)),
               m3mm2 = m3-m2;

  // Highest index that may be written: bounded by the number of coefficients
  // and by the caller's buffer.  Negative when ncoef < 1, which skips the loop.
  const int last = ((ncoef<sz) ? ncoef : sz) - 1;

  int i=0;
  res[i] = srtiny;

  // Start value chosen large so that the |c1| comparison cannot trigger on
  // the very first step.
  double c1=1e300;
  double oldfac=0.; // square-root factor of the previous step

  while (i<last)
    {
    ++i;
    const double l1 = l1min+i,
                 l1sq = l1*l1;

    const double c1old=fabs(c1);

    const double newfac = sqrt((l1sq-l2ml3sq)*(pre1-l1sq)*(l1sq-m1sq));

    if (i>1)
      {
      // general step: three-term recurrence using the two previous terms
      const double tmp1 = 1./((l1-1.)*newfac);
      c1 = (2.*l1-1.)*(pre2-(l1sq-l1)*m3mm2) * tmp1;
      res[i] = res[i-1]*c1 - res[i-2]*l1*oldfac*tmp1;
      }
    else
      {
      // first step: only one previous term exists; the alternative formula
      // avoids the division by (l1-1) when l1 is (numerically) 1
      c1 = (l1>1.000001) ? (2.*l1-1.)*(pre2-(l1sq-l1)*m3mm2)/((l1-1.)*newfac)
                         : (2.*l1-1.)*l1*(m3mm2)/newfac;
      res[i] = res[i-1]*c1;
      }

    oldfac=newfac;
    if (c1old<=fabs(c1)) break; // |c1| stopped decreasing
    }
  }

/*
 * Benchmark driver: calls wrec3jj one million times with fixed arguments,
 * reusing a single heap-allocated result buffer.
 * Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
 */
int main(void)
  {
  enum { RES_LEN = 1000, N_CALLS = 1000000 };

  double *res = malloc(RES_LEN * sizeof *res);
  if (!res) return EXIT_FAILURE;

  // The repetition count exists only to make the run time measurable.
  for (int m=0; m<N_CALLS; ++m)
    wrec3jj (100, 60, 60, -50, res, RES_LEN);

  free(res);
  return 0;
  }

Reply via email to