I have the following testcase:
#include <math.h>

/* Number of rows in mat1.  */
#define N 256
/* Number of rows in mat2.  */
#define M 256

/* Table of doubles with three columns per row; foo reads its rows using
   the values copied out of mat2 as row indices.  */
double mat1[N][3];
/* Table of ints with four columns per row; foo copies each row into a
   local array and uses the copied values as row indices into mat1.  */
int mat2[M][4];

/* Running sum that foo adds the selected mat1 elements into.  */
double point = 0;
/* Value foo compares each copied mat2 element against; set by init.  */
int tmp;

/* Benchmark kernel.  For each of 50000000 passes, walk every row of mat2,
   copy its four elements into the local array ARR, and, when any of the
   copied elements equals TMP, add all three columns of the four mat1 rows
   indexed by ARR into the global POINT.

   Returns the value of POINT after the last pass.

   The store into ARR followed by the loads of arr[0..3] is kept on
   purpose: that load-after-store pattern across separate loops is what
   this testcase measures, so ARR must not be replaced by direct reads of
   mat2[i][j].  */
double
foo ()
{
  int i, j, k, ntimes;
  int arr[4];

  for (ntimes = 0; ntimes < 50000000; ntimes++)
    {
      for (i = 0; i < M; i++)
        {
          /* Store: copy row I of mat2 into ARR.  */
          for (j = 0; j < 4; j++)
            arr[j] = mat2[i][j];
          /* Load: the just-stored elements are read back here...  */
          if (arr[0] == tmp || arr[1] == tmp ||
              arr[2] == tmp || arr[3] == tmp)
            {
              /* ...and here, where each one selects a row of mat1.  */
              for (j = 0; j < 4; j++)
                for (k = 0; k < 3; k++)
                  point += (double) mat1[arr[j]][k];
            }
        }
    }

  /* A function declared to return double must return a value.  */
  return point;
}


/* Initialise the global tables used by foo:
   - row I of mat1 becomes { I, I + 1, I + 2 } as doubles;
   - every element of mat2 becomes 0;
   - tmp becomes 33.  */
void
init ()
{
  int i;

  for (i = 0; i < N; i++)
    {
      mat1[i][0] = (double) i;
      mat1[i][1] = (double) i + 1;
      mat1[i][2] = (double) i + 2;
    }

  /* Index mat2 with the loop variable itself so that every one of the M
     rows is written and no uninitialised index is ever read.  */
  for (i = 0; i < M; i++)
    {
      mat2[i][0] = 0;
      mat2[i][1] = 0;
      mat2[i][2] = 0;
      mat2[i][3] = 0;
    }
  tmp = 33;
}

/* Entry point: fill the global tables, then run the benchmark kernel.
   The value returned by foo is not needed here.  */
int
main ()
{
  init ();
  foo ();
  /* Explicit status: before C99, falling off the end of main leaves the
     exit status unspecified.  */
  return 0;
}

Is there an option that makes GCC recognize the load-after-store of
arr[0], arr[1], arr[2] and arr[3] (after unrolling) and replace them all
with registers? Is there a flag that does that?

Doing such a transformation will improve the testcase by 40%.

I've tried that on GCC 4.4.0, r139150, with -O3 (-funroll-loops -fgcse-las make
it worse).


-- 
           Summary: GCC for Cell SPU produces poor code when there is load-
                    after-store in different loops
           Product: gcc
           Version: 4.4.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: tehila at il dot ibm dot com
GCC target triplet: Cell SPU


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=37221

Reply via email to