I have the following testcase: #include <math.h> #define N 256 #define M 256
double mat1[N][3]; int mat2[M][4]; double point = 0; int tmp; double foo () { int i, j, k, l, ntimes; int arr[4]; for (ntimes = 0; ntimes < 50000000; ntimes++) { for (i = 0; i < M; i++) { for (j = 0; j < 4; j++) arr[j] = mat2[i][j]; if (arr[0] == tmp || arr[1] == tmp || arr[2] == tmp || arr[3] == tmp) { for (j = 0; j < 4; j++) for (k = 0; k < 3; k++) point += (double) mat1[arr[j]][k]; } } } } void init () { int i, j; for (i = 0; i < N; i++) { mat1[i][0] = (double) i; mat1[i][1] = (double) i + 1; mat1[i][2] = (double) i + 2; } for (i = 0; i < M; i++) { mat2[j][0] = 0; mat2[j][1] = 0; mat2[j][2] = 0; mat2[j][3] = 0; } tmp = 33; } int main () { init (); foo (); } Is there an option that makes GCC recognize the load-after-store of arr[0], arr[1], arr[2] and arr[3] (after unrolling) and replace them all with registers? Is there a flag that does that? Doing such a transformation will improve the testcase by 40%. I've tried that on GCC 4.4.0, r139150, with -O3 (-funroll-loops -fgcse-las makes it worse). -- Summary: GCC for Cell SPU produces poor code when there is load-after-store in different loops Product: gcc Version: 4.4.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: c AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: tehila at il dot ibm dot com GCC target triplet: Cell SPU http://gcc.gnu.org/bugzilla/show_bug.cgi?id=37221