https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86029
Tavian Barnes <tavianator at gmail dot com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |tavianator at gmail dot com

--- Comment #1 from Tavian Barnes <tavianator at gmail dot com> ---
This may be a duplicate of https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70291.

In the -O3 version, __mulsc3() dominates the profile.

       │      for(int i=0; i<=decimate_taps_length; i++) decim += samplebuf[i] * decimate_taps[i];
  0.20 │430:┌─→vmovss 0x4(%r13,%rbx,1),%xmm1
  3.63 │    │  vmovss 0x0(%r13,%rbx,1),%xmm0
 12.35 │    │  vmovss 0x4(%r12,%rbx,1),%xmm3
  0.31 │    │  vmovss (%r12,%rbx,1),%xmm2
  0.02 │    │  add $0x8,%rbx
 36.48 │    │→ callq __mulsc3
  0.01 │    │  vmovss -0x78(%rbp),%xmm6
  0.00 │    │  vmovss -0x80(%rbp),%xmm4
 23.70 │    │  vmovq %xmm0,-0x68(%rbp)
 14.25 │    │  vaddss -0x68(%rbp),%xmm6,%xmm5
  1.54 │    │  vaddss -0x64(%rbp),%xmm4,%xmm7
  0.48 │    │  vmovss %xmm5,-0x78(%rbp)
  5.92 │    │  vmovss %xmm7,-0x80(%rbp)
       │    ├──cmp $0x2590,%rbx
  0.01 │    └──jne 430

At -Ofast, the same loop is vectorized and the listing shows no call to __mulsc3():

       │      for(int i=0; i<=decimate_taps_length; i++) decim += samplebuf[i] * decimate_taps[i];
  9.36 │5e0:   vpermilps $0xf5,(%r12,%rax,1),%ymm0
 15.56 │       vpermilps $0xa0,(%r12,%rax,1),%ymm1
 11.24 │       vmulps (%rbx,%rax,1),%ymm0,%ymm0
 17.55 │       vpermilps $0xb1,(%rbx,%rax,1),%ymm4
  3.31 │       add $0x20,%rax
  2.11 │       vmovaps %ymm1,%ymm3
  6.62 │       vfmadd132ps %ymm4,%ymm0,%ymm3
  3.79 │       vfmsub231ps %ymm4,%ymm1,%ymm0
  2.91 │       vblendps $0xaa,%ymm0,%ymm3,%ymm0
 10.75 │       vaddps %ymm0,%ymm6,%ymm6
       │       cmp $0x2580,%rax
  5.59 │     ↑ jne 5e0
  0.01 │       vmovss 0x258c(%rbx),%xmm0
  0.01 │       vmovss -0x70(%rbp),%xmm7
  0.01 │       vmovss %xmm5,-0xd0(%rbp)
  0.05 │       vextractf128 $0x1,%ymm6,%xmm3
  0.01 │       vmovss 0x2588(%rbx),%xmm8
  0.03 │       vshufps $0xff,%xmm3,%xmm3,%xmm13