https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68030
--- Comment #7 from rguenther at suse dot de <rguenther at suse dot de> --- On May 10, 2016 6:25:57 PM GMT+02:00, "amker at gcc dot gnu.org" <gcc-bugzi...@gcc.gnu.org> wrote: >https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68030 > >--- Comment #6 from amker at gcc dot gnu.org --- >It's not only the vectorizer generating CSE sub-optimal code, pre and >lim also >do this kind of transform. In another PR I suggested swapping LIM and PRE to cleanup after LIM. IIRC that had some testsuite regressions. >Compiling the attached example with below command line > >$ ./gcc -S -Ofast -march=haswell pr68030.c -o pr68030.S >-fdump-tree-vect-details -fdump-tree-slp -fdump-tree-ivopts-details >-fdump-tree-all -fno-tree-vectorize > >Gives below dump info before IVOPT: > > <bb 2>: > local_Filter_33 = global_Filters; > pretmp_887 = global_Output; > pretmp_889 = global_Input; > goto <bb 7>; > > <bb 3>: > > <bb 4>: > # ix_187 = PHI <_202(3), 2(7)> > # ivtmp_1065 = PHI <ivtmp_1064(3), 512(7)> > _154 = ix_187 + -2; > _157 = _154 + _971; > _158 = (long unsigned int) _157; > _159 = _158 * 4; > _160 = pretmp_889 + _159; > _161 = *_160; > _165 = *local_Filter_33; > _166 = _161 * _165; > _170 = ix_187 + -1; > _173 = _170 + _971; > _174 = (long unsigned int) _173; > _175 = _174 * 4; > _176 = pretmp_889 + _175; > _177 = *_176; > _181 = MEM[(float *)local_Filter_33 + 4B]; > _182 = _177 * _181; > _81 = _166 + _182; > _189 = ix_187 + _971; > _190 = (long unsigned int) _189; > _191 = _190 * 4; > _192 = pretmp_889 + _191; > _193 = *_192; > _197 = MEM[(float *)local_Filter_33 + 8B]; > _198 = _193 * _197; > _202 = ix_187 + 1; > _205 = _202 + _971; > _206 = (long unsigned int) _205; > _207 = _206 * 4; > _208 = pretmp_889 + _207; > _209 = *_208; > _213 = MEM[(float *)local_Filter_33 + 12B]; > _214 = _209 * _213; > _218 = ix_187 + 2; > _221 = _218 + _971; > _222 = (long unsigned int) _221; > _223 = _222 * 4; > _224 = pretmp_889 + _223; > _225 = *_224; > _229 = MEM[(float *)local_Filter_33 + 16B]; > _230 = 
_225 * _229; > _82 = _214 + _230; > _67 = _81 + _82; > _243 = _154 + _980; > _244 = (long unsigned int) _243; > _245 = _244 * 4; > _246 = pretmp_889 + _245; > _247 = *_246; > _251 = MEM[(float *)local_Filter_33 + 20B]; > _252 = _247 * _251; > _259 = _170 + _980; > _260 = (long unsigned int) _259; > _261 = _260 * 4; > _262 = pretmp_889 + _261; > _263 = *_262; > _267 = MEM[(float *)local_Filter_33 + 24B]; > _268 = _263 * _267; > _78 = _252 + _268; > _275 = ix_187 + _980; > _276 = (long unsigned int) _275; > _277 = _276 * 4; > _278 = pretmp_889 + _277; > _279 = *_278; > _283 = MEM[(float *)local_Filter_33 + 28B]; > _284 = _279 * _283; > _72 = _198 + _284; > _291 = _202 + _980; > _292 = (long unsigned int) _291; > _293 = _292 * 4; > _294 = pretmp_889 + _293; > _295 = *_294; > _299 = MEM[(float *)local_Filter_33 + 32B]; > _300 = _295 * _299; > _307 = _218 + _980; > _308 = (long unsigned int) _307; > _309 = _308 * 4; > _310 = pretmp_889 + _309; > _311 = *_310; > _315 = MEM[(float *)local_Filter_33 + 36B]; > _316 = _311 * _315; > _79 = _300 + _316; > _56 = _78 + _79; > _329 = _154 + _985; > _330 = (long unsigned int) _329; > _331 = _330 * 4; > _332 = pretmp_889 + _331; > _333 = *_332; > _337 = MEM[(float *)local_Filter_33 + 40B]; > _338 = _333 * _337; > _345 = _170 + _985; > _346 = (long unsigned int) _345; > _347 = _346 * 4; > _348 = pretmp_889 + _347; > _349 = *_348; > _353 = MEM[(float *)local_Filter_33 + 44B]; > _354 = _349 * _353; > _75 = _338 + _354; > _361 = ix_187 + _985; > _362 = (long unsigned int) _361; > _363 = _362 * 4; > _364 = pretmp_889 + _363; > _365 = *_364; > _369 = MEM[(float *)local_Filter_33 + 48B]; > _370 = _365 * _369; > _377 = _202 + _985; > _378 = (long unsigned int) _377; > _379 = _378 * 4; > _380 = pretmp_889 + _379; > _381 = *_380; > _385 = MEM[(float *)local_Filter_33 + 52B]; > _386 = _381 * _385; > _393 = _218 + _985; > _394 = (long unsigned int) _393; > _395 = _394 * 4; > _396 = pretmp_889 + _395; > _397 = *_396; > _401 = MEM[(float 
*)local_Filter_33 + 56B]; > _402 = _397 * _401; > _76 = _386 + _402; > _495 = _75 + _76; > _415 = _154 + _991; > _416 = (long unsigned int) _415; > _417 = _416 * 4; > _418 = pretmp_889 + _417; > _419 = *_418; > _423 = MEM[(float *)local_Filter_33 + 60B]; > _424 = _419 * _423; > _431 = _170 + _991; > _432 = (long unsigned int) _431; > _433 = _432 * 4; > _434 = pretmp_889 + _433; > _435 = *_434; > _439 = MEM[(float *)local_Filter_33 + 64B]; > _440 = _435 * _439; > _572 = _424 + _440; > _447 = ix_187 + _991; > _448 = (long unsigned int) _447; > _449 = _448 * 4; > _450 = pretmp_889 + _449; > _451 = *_450; > _455 = MEM[(float *)local_Filter_33 + 68B]; > _456 = _451 * _455; > _73 = _370 + _456; > _65 = _72 + _73; > _55 = _65 + _67; > _25 = _55 + _56; > _19 = _25 + _495; > _463 = _202 + _991; > _464 = (long unsigned int) _463; > _465 = _464 * 4; > _466 = pretmp_889 + _465; > _467 = *_466; > _471 = MEM[(float *)local_Filter_33 + 72B]; > _472 = _467 * _471; > _479 = _218 + _991; > _480 = (long unsigned int) _479; > _481 = _480 * 4; > _482 = pretmp_889 + _481; > _483 = *_482; > _487 = MEM[(float *)local_Filter_33 + 76B]; > _488 = _483 * _487; > _556 = _472 + _488; > _20 = _556 + _572; > _429 = _19 + _20; > _501 = _154 + _997; > _502 = (long unsigned int) _501; > _503 = _502 * 4; > _504 = pretmp_889 + _503; > _505 = *_504; > _509 = MEM[(float *)local_Filter_33 + 80B]; > _510 = _505 * _509; > _517 = _170 + _997; > _518 = (long unsigned int) _517; > _519 = _518 * 4; > _520 = pretmp_889 + _519; > _521 = *_520; > _525 = MEM[(float *)local_Filter_33 + 84B]; > _526 = _521 * _525; > _444 = _510 + _526; > _533 = ix_187 + _997; > _534 = (long unsigned int) _533; > _535 = _534 * 4; > _536 = pretmp_889 + _535; > _537 = *_536; > _541 = MEM[(float *)local_Filter_33 + 88B]; > _542 = _537 * _541; > _549 = _202 + _997; > _550 = (long unsigned int) _549; > _551 = _550 * 4; > _552 = pretmp_889 + _551; > _553 = *_552; > _557 = MEM[(float *)local_Filter_33 + 92B]; > _558 = _553 * _557; > _565 = 
_218 + _997; > _566 = (long unsigned int) _565; > _567 = _566 * 4; > _568 = pretmp_889 + _567; > _569 = *_568; > _573 = MEM[(float *)local_Filter_33 + 96B]; > _574 = _569 * _573; > _445 = _558 + _574; > _430 = _444 + _445; > _257 = _429 + _430; > sum_575 = _257 + _542; > _21 = pretmp_887 + _363; > *_21 = sum_575; > ivtmp_1064 = ivtmp_1065 - 1; > if (ivtmp_1064 != 0) > goto <bb 3>; > else > goto <bb 5>; > > <bb 5>: > ivtmp_1062 = ivtmp_1063 - 1; > if (ivtmp_1062 != 0) > goto <bb 6>; > else > goto <bb 8>; > > <bb 6>: > > <bb 7>: > # iy_186 = PHI <_990(6), 2(2)> > # ivtmp_1063 = PHI <ivtmp_1062(6), 512(2)> > _970 = iy_186 + -2; > _971 = _970 * 516; > _979 = iy_186 + -1; > _980 = _979 * 516; > _985 = iy_186 * 516; > _990 = iy_186 + 1; > _991 = _990 * 516; > _996 = iy_186 + 2; > _997 = _996 * 516; > goto <bb 4>; > > <bb 8>: > return; > >Most memory references in <bb 4> access the same memory object, >but >IVOPT failed to group these IVs because PRE hoists some parts of the >address >computation into <bb 7>. Also, PRE/LIM creates code that is more difficult to handle than what the >vectorizer generates, because the CSE opportunities are hidden by re-association. > >I will first try to fix the vectorizer issue, since the PRE/LIM issue isn't that >critical: it's only exposed in loops unrolled by tree cunroll >and in >versioned/peeled loops.