------- Comment #6 from rahul at icerasemi dot com 2009-09-11 10:03 ------- An interesting regression arises as a side effect of loop header copying (this occurs even at plain -O2). If I modify my original test case to:
struct struct_t { int* data; }; void testAddr (struct struct_t* sp, int len) { short i; for (i = 0; i < len; i++) { sp->data[len-i-1] = 0; } } The index is now a short, and I have purposely added an int to form the final induction variable. With gcc -S -O2 -fdump-tree-all, I get the following SSA: short int i; int * D.1220; long unsigned int D.1219; long unsigned int D.1218; long unsigned int D.1217; int D.1216; int D.1215; int * D.1214; <bb 2>: goto <bb 4>; <bb 3>: D.1214_6 = sp_5(D)->data; D.1215_7 = (int) i_1; D.1216_8 = len_4(D) - D.1215_7; D.1217_9 = (long unsigned int) D.1216_8; D.1218_10 = D.1217_9 + -1; D.1219_11 = D.1218_10 * 4; D.1220_12 = D.1214_6 + D.1219_11; *D.1220_12 ={v} 0; i_13 = i_1 + 1; <bb 4>: # i_1 = PHI <0(2), i_13(3)> D.1215_3 = (int) i_1; if (D.1215_3 < len_4(D)) goto <bb 3>; else goto <bb 5>; <bb 5>: return; The following copy propagation and/or FRE passes identify D.1215_7 as a copy of D.1215_3, and we get: <bb 3>: D.1214_6 = sp_5(D)->data; D.1216_8 = len_4(D) - D.1215_3; D.1217_9 = (long unsigned int) D.1216_8; D.1218_10 = D.1217_9 + -1; D.1219_11 = D.1218_10 * 4; D.1220_12 = D.1214_6 + D.1219_11; *D.1220_12 = 0; i_13 = i_1 + 1; Loop header copying then introduces a PHI for D.1215: <bb 2>: D.1215_19 = 0; if (D.1215_19 < len_4(D)) goto <bb 3>; else goto <bb 4>; <bb 3>: # i_20 = PHI <i_13(3), 0(2)> # D.1215_21 = PHI <D.1215_3(3), D.1215_19(2)> D.1214_6 = sp_5(D)->data; D.1216_8 = len_4(D) - D.1215_21; D.1217_9 = (long unsigned int) D.1216_8; D.1218_10 = D.1217_9 + -1; D.1219_11 = D.1218_10 * 4; D.1220_12 = D.1214_6 + D.1219_11; *D.1220_12 = 0; i_13 = i_20 + 1; D.1215_3 = (int) i_13; if (D.1215_3 < len_4(D)) goto <bb 3>; else goto <bb 4>; This causes IVOpts (whose output follows) and all subsequent optimisations to fall over.
<bb 3>: D.1214_6 = sp_5(D)->data; D.1238_7 = (unsigned int) len_4(D); D.1239_1 = D.1238_7 + 0x0ffffffff; __builtin_loop_start (1, D.1239_1); D.1241_24 = (unsigned int) len_4(D); <bb 4>: # D.1215_21 = PHI <0(3), D.1215_3(5)> # ivtmp.13_14 = PHI <0(3), ivtmp.13_18(5)> __builtin_loop_iteration (1); D.1216_8 = len_4(D) - D.1215_21; D.1217_9 = (long unsigned int) D.1216_8; D.1218_10 = D.1217_9 + -1; D.1219_11 = D.1218_10 * 4; D.1220_12 = D.1214_6 + D.1219_11; *D.1220_12 = 0; D.1240_19 = ivtmp.13_14 + 1; D.1215_23 = (int) D.1240_19; D.1215_3 = D.1215_23; ivtmp.13_18 = ivtmp.13_14 + 1; if (ivtmp.13_18 != D.1241_24) goto <bb 5>; else goto <bb 6>; On this test, using -fno-tree-copy-prop -fno-tree-pre results in better optimization, implying that copy propagating (across blocks) and/or FREing potential induction variables is undesirable. Alternatively, a less ideal solution is to disable loop header copying when dealing with type-promoted loop indices. -- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=41026