http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57858
--- Comment #2 from vincenzo Innocente <vincenzo.innocente at cern dot ch> ---
Actually, the code for div and sqr is already different for standard SSE:

c++ -std=c++11 -Ofast -S avx2sqrt.cc -ftree-vectorizer-verbose=1 -Wall ; cat avx2sqrt.s

.L2:
        movdqa  %xmm0, %xmm1
        addl    $1, %eax
        movdqa  %xmm0, %xmm4
        cmpl    $256, %eax
        paddd   %xmm5, %xmm1
        pshufd  $238, %xmm1, %xmm0
        cvtdq2pd        %xmm1, %xmm1
        movapd  %xmm3, %xmm7
        paddd   %xmm6, %xmm4
        cvtdq2pd        %xmm0, %xmm0
        divpd   %xmm0, %xmm7
        movapd  %xmm7, %xmm0
        movapd  %xmm3, %xmm7
        divpd   %xmm1, %xmm7
        addpd   %xmm7, %xmm0
        addpd   %xmm0, %xmm2
        jne     .L3
        movapd  %xmm2, -24(%rsp)
        movsd   -16(%rsp), %xmm0
        addsd   %xmm2, %xmm0
        ret
        .cfi_endproc
.LFE3:
        .size   _Z3divv, .-_Z3divv
        .p2align 4,,15
        .globl  _Z3sqrv
        .type   _Z3sqrv, @function
_Z3sqrv:
.LFB4:
        .cfi_startproc
        movl    $1, %eax
        movsd   .LC4(%rip), %xmm1
        xorpd   %xmm0, %xmm0
        jmp     .L6
        .p2align 4,,10
        .p2align 3
.L7:
        cvtsi2sd        %eax, %xmm1
        sqrtsd  %xmm1, %xmm1
.L6:
        addl    $1, %eax
        addsd   %xmm1, %xmm0
        cmpl    $1025, %eax
        jne     .L7
        rep; ret
        .cfi_endproc