https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87166
Martin Liška <marxin at gcc dot gnu.org> changed: What |Removed |Added ---------------------------------------------------------------------------- Status|UNCONFIRMED |NEW Last reconfirmed| |2018-08-31 Ever confirmed|0 |1 --- Comment #1 from Martin Liška <marxin at gcc dot gnu.org> --- I hope I found the root cause: perf report -Ofast: # Overhead Command Shared Object Symbol # ........ ............... ............................ ........................................ # 44.48% calculix_peak.a calculix_peak.amd64-m64-mine [.] e_c3d_ 18.77% calculix_peak.a calculix_peak.amd64-m64-mine [.] DVdot33 6.82% calculix_peak.a calculix_peak.amd64-m64-mine [.] Network_findAugmentingPath -Ofast with PGO: # Overhead Command Shared Object Symbol # ........ ............... ............................ ........................................ # 75.30% calculix_peak.a calculix_peak.amd64-m64-mine [.] e_c3d_.cold.0 7.53% calculix_peak.a calculix_peak.amd64-m64-mine [.] DVdot33 2.58% calculix_peak.a calculix_peak.amd64-m64-mine [.] Network_findAugmentingPath 1.63% calculix_peak.a calculix_peak.amd64-m64-mine [.] nident_ So part of the function is put into the cold text section. The reason is that the train and reference runs have totally different coverage. The hottest code in the ref run is never executed in the train run: e_c3d.f.gcov: TRAIN run: #####: 591: sm(ii1+2,jj1+2)=sm(ii1,jj1) -: 592: endif -: 593:! -: 594: else -: 595:! -: 596:! buckling matrix -: 597:! -: 598: senergyb= -: 599: & (s11b*w(1,1)+s12b*(w(1,2)+w(2,1)) -: 600: & +s13b*(w(1,3)+w(3,1))+s22b*w(2,2) #####: 601: & +s23b*(w(2,3)+w(3,2))+s33b*w(3,3))*weight #####: 602: sm(ii1,jj1)=sm(ii1,jj1)-senergyb #####: 603: sm(ii1+1,jj1+1)=sm(ii1+1,jj1+1)-senergyb #####: 604: sm(ii1+2,jj1+2)=sm(ii1+2,jj1+2)-senergyb -: 605:! -: 606: endif -: 607:! 5M: 608: ii1=ii1+3 -: 609: enddo 449k: 610: jj1=jj1+3 -: 611: enddo -: 612: else -: 613:! -: 614:! stiffness matrix for static and modal -: 615:! 2nd order calculations -: 616:! -: 617:!
large displacement stiffness -: 618:! #####: 619: do i1=1,3 #####: 620: do j1=1,3 #####: 621: vo(i1,j1)=0.d0 #####: 622: do k1=1,nope #####: 623: vo(i1,j1)=vo(i1,j1)+shp(j1,k1)*voldl(i1,k1) -: 624: enddo -: 625: enddo -: 626: enddo -: 627:! #####: 628: if(mattyp.eq.1) then #####: 629: call wcoef(v,vo,al,um) -: 630: endif -: 631:! -: 632:! calculating the total mass of the element for -: 633:! lumping purposes: only for explicit nonlinear -: 634:! dynamic calculations -: 635:! #####: 636: if(mass.and.(iexpl.eq.1)) then #####: 637: summass=summass+rho*xsj -: 638: endif -: 639:! #####: 640: jj1=1 #####: 641: do jj=1,nope -: 642:! #####: 643: ii1=1 #####: 644: do ii=1,jj -: 645:! -: 646:! all products of the shape functions for a given ii -: 647:! and jj -: 648:! #####: 649: do i1=1,3 #####: 650: do j1=1,3 #####: 651: w(i1,j1)=shpj(i1,ii)*shpj(j1,jj) -: 652: enddo -: 653: enddo -: 654:! #####: 655: if(mattyp.eq.1) then -: 656:! #####: 657: do m1=1,3 #####: 658: do m2=1,3 #####: 659: do m3=1,3 #####: 660: do m4=1,3 -: 661: s(ii1+m2-1,jj1+m1-1)= -: 662: & s(ii1+m2-1,jj1+m1-1) #####: 663: & +v(m4,m3,m2,m1)*w(m4,m3)*weight -: 664: enddo -: 665: enddo -: 666: enddo -: 667: enddo -: 668:! #####: 669: elseif(mattyp.eq.2) then -: 670:! #####: 671: call orthonl(w,vo,elas,s,ii1,jj1,weight) -: 672:! -: 673: else -: 674:! 
#####: 675: do i1=1,3 #####: 676: iii1=ii1+i1-1 #####: 677: do j1=1,3 #####: 678: jjj1=jj1+j1-1 #####: 679: do k1=1,3 #####: 680: do l1=1,3 -: 681: s(iii1,jjj1)=s(iii1,jjj1) #####: 682: & +anisox(i1,k1,j1,l1)*w(k1,l1)*weight #####: 683: do m1=1,3 -: 684: s(iii1,jjj1)=s(iii1,jjj1) -: 685: & +anisox(i1,k1,m1,l1)*w(k1,l1) -: 686: & *vo(j1,m1)*weight -: 687: & +anisox(m1,k1,j1,l1)*w(k1,l1) #####: 688: & *vo(i1,m1)*weight #####: 689: do n1=1,3 -: 690: s(iii1,jjj1)=s(iii1,jjj1) -: 691: & +anisox(m1,k1,n1,l1) -: 692: & *w(k1,l1)*vo(i1,m1)*vo(j1,n1) #####: 693: & *weight -: 694: enddo -: 695: enddo -: 696: enddo -: 697: enddo -: 698: enddo -: 699: enddo -: 700:!SPEC: The immediately preceding loop nest is also available in -: 701:!SPEC: program-generated (much longer) form from the author's -: 702:!SPEC: website (see 454.calculix/Docs) in file anisonl.f -: 703:!SPEC: -: 704:!SPEC: call anisonl(w,vo,elas,s,ii1,jj1,weight) -: 705:!SPEC: -: 706: endif -: 707:! -: 708:! stress stiffness -: 709:! -: 710: senergy= REF run: #####: 591: sm(ii1+2,jj1+2)=sm(ii1,jj1) -: 592: endif -: 593:! -: 594: else -: 595:! -: 596:! buckling matrix -: 597:! -: 598: senergyb= -: 599: & (s11b*w(1,1)+s12b*(w(1,2)+w(2,1)) -: 600: & +s13b*(w(1,3)+w(3,1))+s22b*w(2,2) #####: 601: & +s23b*(w(2,3)+w(3,2))+s33b*w(3,3))*weight #####: 602: sm(ii1,jj1)=sm(ii1,jj1)-senergyb #####: 603: sm(ii1+1,jj1+1)=sm(ii1+1,jj1+1)-senergyb #####: 604: sm(ii1+2,jj1+2)=sm(ii1+2,jj1+2)-senergyb -: 605:! -: 606: endif -: 607:! #####: 608: ii1=ii1+3 -: 609: enddo #####: 610: jj1=jj1+3 -: 611: enddo -: 612: else -: 613:! -: 614:! stiffness matrix for static and modal -: 615:! 2nd order calculations -: 616:! -: 617:! large displacement stiffness -: 618:! 11M: 619: do i1=1,3 36M: 620: do j1=1,3 25M: 621: vo(i1,j1)=0.d0 532M: 622: do k1=1,nope 523M: 623: vo(i1,j1)=vo(i1,j1)+shp(j1,k1)*voldl(i1,k1) -: 624: enddo -: 625: enddo -: 626: enddo -: 627:! 3M: 628: if(mattyp.eq.1) then 1M: 629: call wcoef(v,vo,al,um) -: 630: endif -: 631:! 
-: 632:! calculating the total mass of the element for -: 633:! lumping purposes: only for explicit nonlinear -: 634:! dynamic calculations -: 635:! 3M*: 636: if(mass.and.(iexpl.eq.1)) then #####: 637: summass=summass+rho*xsj -: 638: endif -: 639:! 3M: 640: jj1=1 58M: 641: do jj=1,nope -: 642:! 55M: 643: ii1=1 637M: 644: do ii=1,jj -: 645:! -: 646:! all products of the shape functions for a given ii -: 647:! and jj -: 648:! 2G: 649: do i1=1,3 8G: 650: do j1=1,3 7G: 651: w(i1,j1)=shpj(i1,ii)*shpj(j1,jj) -: 652: enddo -: 653: enddo -: 654:! 582M: 655: if(mattyp.eq.1) then -: 656:! 1G: 657: do m1=1,3 4G: 658: do m2=1,3 11G: 659: do m3=1,3 33G: 660: do m4=1,3 -: 661: s(ii1+m2-1,jj1+m1-1)= -: 662: & s(ii1+m2-1,jj1+m1-1) 31G: 663: & +v(m4,m3,m2,m1)*w(m4,m3)*weight -: 664: enddo -: 665: enddo -: 666: enddo -: 667: enddo -: 668:! 299M: 669: elseif(mattyp.eq.2) then -: 670:! 3M: 671: call orthonl(w,vo,elas,s,ii1,jj1,weight) -: 672:! -: 673: else -: 674:! 1G: 675: do i1=1,3 889M: 676: iii1=ii1+i1-1 4G: 677: do j1=1,3 3G: 678: jjj1=jj1+j1-1 12G: 679: do k1=1,3 35G: 680: do l1=1,3 -: 681: s(iii1,jjj1)=s(iii1,jjj1) 24G: 682: & +anisox(i1,k1,j1,l1)*w(k1,l1)*weight 104G: 683: do m1=1,3 -: 684: s(iii1,jjj1)=s(iii1,jjj1) -: 685: & +anisox(i1,k1,m1,l1)*w(k1,l1) -: 686: & *vo(j1,m1)*weight -: 687: & +anisox(m1,k1,j1,l1)*w(k1,l1) 72G: 688: & *vo(i1,m1)*weight 312G: 689: do n1=1,3 -: 690: s(iii1,jjj1)=s(iii1,jjj1) -: 691: & +anisox(m1,k1,n1,l1) -: 692: & *w(k1,l1)*vo(i1,m1)*vo(j1,n1) 288G: 693: & *weight -: 694: enddo -: 695: enddo -: 696: enddo -: 697: enddo -: 698: enddo -: 699: enddo -: 700:!SPEC: The immediately preceding loop nest is also available in -: 701:!SPEC: program-generated (much longer) form from the author's -: 702:!SPEC: website (see 454.calculix/Docs) in file anisonl.f -: 703:!SPEC: -: 704:!SPEC: call anisonl(w,vo,elas,s,ii1,jj1,weight) -: 705:!SPEC: -: 706: endif -: 707:! -: 708:! stress stiffness -: 709:! 
-: 710: senergy= The hottest part is lines 689-693, which execute roughly 300G times in the reference run. Because that code is never executed in the train run, we optimize it for size.