https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71088

            Bug ID: 71088
           Summary: [i386, AVX-512, Perf] vpermi2ps instead of vpermps
                    emitted
           Product: gcc
           Version: 7.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: iverbin at gcc dot gnu.org
                CC: ienkovich at gcc dot gnu.org, izamyatin at gmail dot com,
                    kyukhin at gcc dot gnu.org, ubizjak at gmail dot com
  Target Milestone: ---

Testcase:

float foo (float *arr1, float *arr2, float *max_x, int M, float s)
{
  float *res = new float[M];

  for (int i = M - 1; i >= 0; i--)
    for (int j = 0; j <= i; j++)
      {
        float x = arr1[j] * arr2[i - j] + s;
        res[j] = x > max_x[j] ? x : max_x[j];
      }

  return res[0];
}

To reproduce:

$ g++ -S test.cpp -Ofast -funroll-loops -march=knl

GCC emits the vpermi2ps instruction to rearrange the elements of arr2 backwards.
However, this instruction writes its result into the index register, so an
additional mov is needed before each vpermi2ps to restore the indexes [1].
There are also some weird movs after each vpermi2ps [2]; it's not clear why the
result of vpermi2ps isn't passed directly to vfmadd132ps.

.L1:
        vmovups       (%r11), %zmm9
        vmovdqa64     %zmm2, %zmm1                  # [1]
        vpermi2ps     %zmm9, %zmm9, %zmm1
        vmovdqa64     %zmm2, %zmm16                 # [1]
        vmovaps       %zmm1, %zmm10                 # [2]
        vmovdqa64     %zmm2, %zmm1                  # [1]
        vmovups       -64(%r11), %zmm12
        vfmadd132ps   (%rax,%r9), %zmm3, %zmm10
        vpermi2ps     %zmm12, %zmm12, %zmm1
        vmaxps        (%rcx,%r9), %zmm10, %zmm11
        vmovaps       %zmm1, %zmm13                 # [2]
        vmovdqa64     %zmm2, %zmm1                  # [1]
        vmovups       -128(%r11), %zmm15
        vfmadd132ps   64(%rax,%r9), %zmm3, %zmm13
        vmovups       -192(%r11), %zmm6
        vpermi2ps     %zmm15, %zmm15, %zmm1
        vpermi2ps     %zmm6, %zmm6, %zmm16
        vmovaps       %zmm1, %zmm4                  # [2]
        vmovaps       %zmm16, %zmm7                 # [2]
        vmaxps        64(%rcx,%r9), %zmm13, %zmm14
        vfmadd132ps   128(%rax,%r9), %zmm3, %zmm4
        vfmadd132ps   192(%rax,%r9), %zmm3, %zmm7
        vmaxps        128(%rcx,%r9), %zmm4, %zmm5
        leal          4(%r15), %r15d
        vmaxps        192(%rcx,%r9), %zmm7, %zmm8
        cmpl          %esi, %r15d
        vmovups       %zmm11, (%r8,%r9)
        leaq          -256(%r11), %r11
        vmovups       %zmm14, 64(%r8,%r9)
        vmovups       %zmm5, 128(%r8,%r9)
        vmovups       %zmm8, 192(%r8,%r9)
        leaq          256(%r9), %r9
        jb            .L1
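For reference, here is a minimal intrinsics sketch of the vectorized inner loop
(the function signature, loop bounds and variable names are mine, not the
compiler's).  _mm512_permutex2var_ps corresponds to the vpermi2ps/vpermt2ps
family: one of its vector sources doubles as the destination, so a register
holding the reversal indexes cannot survive the instruction and has to be
copied back before every use, which is where the movs marked [1] come from:

#include <immintrin.h>

/* Hypothetical scalar/loop setup; only the permute choice matters here.  */
void inner_loop (const float *arr1, const float *arr2, const float *max_x,
                 float *res, float s, int i)
{
  /* idx[k] = 15 - k, i.e. reverse the 16 lanes.  */
  const __m512i rev = _mm512_set_epi32 (0, 1, 2, 3, 4, 5, 6, 7,
                                        8, 9, 10, 11, 12, 13, 14, 15);
  const __m512 vs = _mm512_set1_ps (s);

  for (int j = 0; j + 16 <= i + 1; j += 16)
    {
      __m512 a2 = _mm512_loadu_ps (arr2 + (i - j - 15)); /* arr2[i-j-15..i-j] */
      /* Two-source permute: in the vpermi2ps form the index operand is also
         the destination, so the compiler re-materializes the index vector
         (%zmm2 above) before each permute.  */
      __m512 r2 = _mm512_permutex2var_ps (a2, rev, a2);  /* arr2[i-j..i-j-15] */
      __m512 x  = _mm512_fmadd_ps (_mm512_loadu_ps (arr1 + j), r2, vs);
      __m512 m  = _mm512_max_ps (x, _mm512_loadu_ps (max_x + j));
      _mm512_storeu_ps (res + j, m);
    }
}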

Instead, vpermps can be used.  It doesn't overwrite the index register, which
allows getting rid of 8 movs in this loop:

.L2:
        lea           (,%r12,4), %r10
        negq          %r10
        addq          %rbx, %r10
        vpermps       -64(%r10), %zmm3, %zmm4
        vpermps       -128(%r10), %zmm3, %zmm6
        vpermps       -192(%r10), %zmm3, %zmm8
        vpermps       -256(%r10), %zmm3, %zmm10
        vfmadd132ps   (%r11,%r12,4), %zmm2, %zmm4
        vfmadd132ps   64(%r11,%r12,4), %zmm2, %zmm6
        vfmadd132ps   128(%r11,%r12,4), %zmm2, %zmm8
        vfmadd132ps   192(%r11,%r12,4), %zmm2, %zmm10
        vmaxps        (%r13,%r12,4), %zmm4, %zmm5
        vmovups       %zmm5, (%rdi,%r12,4)
        vmaxps        64(%r13,%r12,4), %zmm6, %zmm7
        vmovups       %zmm7, 64(%rdi,%r12,4)
        vmaxps        128(%r13,%r12,4), %zmm8, %zmm9
        vmovups       %zmm9, 128(%rdi,%r12,4)
        vmaxps        192(%r13,%r12,4), %zmm10, %zmm11
        vmovups       %zmm11, 192(%rdi,%r12,4)
        addq          $64, %r12
        cmpq          %rax, %r12
        jb            .L2
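The vpermps instruction corresponds to the single-source permute intrinsic
_mm512_permutexvar_ps, whose index operand is only read, never written, so one
index register (%zmm3 above) can serve every iteration.  In the sketch above,
the only change would be the permute line (same hypothetical names):

      /* Single-source permute (vpermps): rev stays intact, so no copy of the
         index vector is needed before each permute.  */
      __m512 r2 = _mm512_permutexvar_ps (rev, a2);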
