https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71088
Bug ID: 71088
Summary: [i386, AVX-512, Perf] vpermi2ps instead of vpermps emitted
Product: gcc
Version: 7.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: iverbin at gcc dot gnu.org
CC: ienkovich at gcc dot gnu.org, izamyatin at gmail dot com,
    kyukhin at gcc dot gnu.org, ubizjak at gmail dot com
Target Milestone: ---

Testcase:

float foo (float *arr1, float *arr2, float *max_x, int M, float s)
{
  float *res = new float[M];
  for (int i = M - 1; i >= 0; i--)
    for (int j = 0; j <= i; j++)
      {
        float x = arr1[j] * arr2[i - j] + s;
        res[j] = x > max_x[j] ? x : max_x[j];
      }
  return res[0];
}

To reproduce:
$ g++ -S test.cpp -Ofast -funroll-loops -march=knl

GCC emits the vpermi2ps instruction to rearrange the elements of arr2
backwards.  However, this instruction writes its result into the index
register, so an additional mov is needed before each vpermi2ps to restore
the indexes [1].  There are also some redundant movs after each vpermi2ps
[2]; it is not clear why the result of vpermi2ps is not passed directly to
vfmadd132ps.

.L1:
        vmovups (%r11), %zmm9
        vmovdqa64 %zmm2, %zmm1          # [1]
        vpermi2ps %zmm9, %zmm9, %zmm1
        vmovdqa64 %zmm2, %zmm16         # [1]
        vmovaps %zmm1, %zmm10           # [2]
        vmovdqa64 %zmm2, %zmm1          # [1]
        vmovups -64(%r11), %zmm12
        vfmadd132ps (%rax,%r9), %zmm3, %zmm10
        vpermi2ps %zmm12, %zmm12, %zmm1
        vmaxps (%rcx,%r9), %zmm10, %zmm11
        vmovaps %zmm1, %zmm13           # [2]
        vmovdqa64 %zmm2, %zmm1          # [1]
        vmovups -128(%r11), %zmm15
        vfmadd132ps 64(%rax,%r9), %zmm3, %zmm13
        vmovups -192(%r11), %zmm6
        vpermi2ps %zmm15, %zmm15, %zmm1
        vpermi2ps %zmm6, %zmm6, %zmm16
        vmovaps %zmm1, %zmm4            # [2]
        vmovaps %zmm16, %zmm7           # [2]
        vmaxps 64(%rcx,%r9), %zmm13, %zmm14
        vfmadd132ps 128(%rax,%r9), %zmm3, %zmm4
        vfmadd132ps 192(%rax,%r9), %zmm3, %zmm7
        vmaxps 128(%rcx,%r9), %zmm4, %zmm5
        leal 4(%r15), %r15d
        vmaxps 192(%rcx,%r9), %zmm7, %zmm8
        cmpl %esi, %r15d
        vmovups %zmm11, (%r8,%r9)
        leaq -256(%r11), %r11
        vmovups %zmm14, 64(%r8,%r9)
        vmovups %zmm5, 128(%r8,%r9)
        vmovups %zmm8, 192(%r8,%r9)
        leaq 256(%r9), %r9
        jb .L1

Instead, vpermps can be used.  It does not overwrite the index register,
which makes it possible to eliminate 8 movs in this loop:

.L2:
        lea (,%r12,4), %r10
        negq %r10
        addq %rbx, %r10
        vpermps -64(%r10), %zmm3, %zmm4
        vpermps -128(%r10), %zmm3, %zmm6
        vpermps -192(%r10), %zmm3, %zmm8
        vpermps -256(%r10), %zmm3, %zmm10
        vfmadd132ps (%r11,%r12,4), %zmm2, %zmm4
        vfmadd132ps 64(%r11,%r12,4), %zmm2, %zmm6
        vfmadd132ps 128(%r11,%r12,4), %zmm2, %zmm8
        vfmadd132ps 192(%r11,%r12,4), %zmm2, %zmm10
        vmaxps (%r13,%r12,4), %zmm4, %zmm5
        vmovups %zmm5, (%rdi,%r12,4)
        vmaxps 64(%r13,%r12,4), %zmm6, %zmm7
        vmovups %zmm7, 64(%rdi,%r12,4)
        vmaxps 128(%r13,%r12,4), %zmm8, %zmm9
        vmovups %zmm9, 128(%rdi,%r12,4)
        vmaxps 192(%r13,%r12,4), %zmm10, %zmm11
        vmovups %zmm11, 192(%rdi,%r12,4)
        addq $64, %r12
        cmpq %rax, %r12
        jb .L2
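
For illustration (not part of the original report), a minimal C++ intrinsics
sketch of the element reversal using the single-source permute.  The helper
name reverse16 and the index constant are hypothetical; the intrinsics
themselves are the standard ones: _mm512_permutexvar_ps maps to vpermps and
treats the index vector as a read-only source, while _mm512_permutex2var_ps
maps to vpermi2ps, whose index operand is also the destination and therefore
has to be re-copied before each use (the vmovdqa64 instructions marked [1]
above).

#include <immintrin.h>

/* Hypothetical helper, for illustration only: reverse 16 packed floats.  */
/* The index vector 'rev' selects element 15-i into lane i.  Because      */
/* vpermps does not clobber its index register, 'rev' can live in one     */
/* register for the whole loop with no extra moves.                       */
static inline __m512 reverse16 (const float *p)
{
  const __m512i rev = _mm512_set_epi32 (0, 1, 2, 3, 4, 5, 6, 7,
                                        8, 9, 10, 11, 12, 13, 14, 15);
  return _mm512_permutexvar_ps (rev, _mm512_loadu_ps (p));  /* vpermps */
}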