http://gcc.gnu.org/bugzilla/show_bug.cgi?id=56522
--- Comment #2 from Jakub Jelinek <jakub at gcc dot gnu.org> 2013-03-06 15:06:41 UTC --- Created attachment 29598 --> http://gcc.gnu.org/bugzilla/attachment.cgi?id=29598 assign.c With -O3 -march=corei7 -fomit-frame-pointer -funroll-loops -ffast-math the difference in the *.optimized dump from r196262 to r196263 is just: @@ -176,7 +176,6 @@ Assignment (long int[101] * x) short int[101][101] * pretmp_418; long int _429; long int _431; - unsigned long _432; long unsigned int patt_438; unsigned int _440; long unsigned int patt_441; @@ -293,8 +292,7 @@ Assignment (long int[101] * x) _108 = _130 >> 3; _89 = -_108; _72 = (short unsigned int) _89; - _432 = _89 & 1; - prolog_loop_niters.59_193 = (short unsigned int) _432; + prolog_loop_niters.59_193 = _72 & 1; if (prolog_loop_niters.59_193 == 0) goto <bb 19>; else @@ -307,7 +305,7 @@ Assignment (long int[101] * x) <bb 19>: # j_288 = PHI <1(18), 0(17)> # c_287 = PHI <c_141(18), 9223372036854775807(17)> - prolog_loop_adjusted_niters.60_357 = _89 & 1; + prolog_loop_adjusted_niters.60_357 = (sizetype) prolog_loop_niters.59_193; niters.61_359 = 101 - prolog_loop_niters.59_193; base_off.68_53 = prolog_loop_adjusted_niters.60_357 * 8; vect_p.69_48 = pretmp_386 + base_off.68_53; From the bug report, it isn't clear whether you were measuring -m32 or -m64 performance, but I guess the *.optimized dump change could just increase register pressure and pessimize register allocation (RA) for the loop, or something.