https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66119
Bug ID: 66119
Summary: Regression in optimization of avx-code
Product: gcc
Version: 5.1.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c++
Assignee: unassigned at gcc dot gnu.org
Reporter: joachim.schoeberl at tuwien dot ac.at
Target Milestone: ---

Created attachment 35525
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=35525&action=edit
testcode

gcc 5.1 produces a lot of scalar moves for the attached vector-class code.
gcc 4.9 generates compact code (see below).

Compiled using:
gcc -O3 -mavx -S -std=c++11 testgcc.cpp

Compiler version:
gcc (GCC) 5.1.1 20150505

gcc 5.1 works fine in any of the following cases:

- we use a manual copy constructor instead of '= default' (line 37):
  MyTSIMD (const MyTSIMD & s2) : data(s2.data) { ; }

- we use the concrete vector-class instead of the template (line 45):
  using MyVec = MyAVX;

- we do not use __attribute__ ((__always_inline__)) for ComputeSomething (line 58)

Cheers, Joachim

Code generated by gcc 5.1:

        .globl _Z12TestFunction4Vec2S_
        .type _Z12TestFunction4Vec2S_, @function
_Z12TestFunction4Vec2S_:
.LFB4604:
        .cfi_startproc
        movq 72(%rsp), %rdx
        vmovapd 40(%rsp), %ymm0
        movq %rdi, %rax
        vmovapd 8(%rsp), %ymm1
        movq %rdx, -88(%rsp)
        movq 80(%rsp), %rdx
        movq %rdx, -80(%rsp)
        movq 88(%rsp), %rdx
        movq %rdx, -72(%rsp)
        movq 96(%rsp), %rdx
        movq %rdx, -64(%rsp)
        movq 104(%rsp), %rdx
        vaddpd -88(%rsp), %ymm1, %ymm1
        movq %rdx, -56(%rsp)
        movq 112(%rsp), %rdx
        movq %rdx, -48(%rsp)
        movq 120(%rsp), %rdx
        vmovapd %ymm1, (%rdi)
        movq %rdx, -40(%rsp)
        movq 128(%rsp), %rdx
        movq %rdx, -32(%rsp)
        vaddpd -56(%rsp), %ymm0, %ymm0
        vmovapd %ymm0, 32(%rdi)
        vzeroupper
        ret
        .cfi_endproc

Code generated by gcc 4.9.2:

        .type _Z12TestFunction4Vec2S_, @function
_Z12TestFunction4Vec2S_:
.LFB2234:
        .cfi_startproc
        vmovapd 40(%rsp), %ymm0
        movq %rdi, %rax
        vmovapd 8(%rsp), %ymm1
        vaddpd 104(%rsp), %ymm0, %ymm0
        vaddpd 72(%rsp), %ymm1, %ymm1
        vmovapd %ymm0, 32(%rdi)
        vmovapd %ymm1, (%rdi)
        vzeroupper
        ret
        .cfi_endproc