https://llvm.org/bugs/show_bug.cgi?id=28002
Bug ID: 28002
Summary: Performance regression summing small float array
Product: libraries
Version: 3.8
Hardware: PC
OS: Linux
Status: NEW
Severity: normal
Priority: P
Component: Backend: X86
Assignee: unassignedb...@nondot.org
Reporter: yyc1...@gmail.com
CC: llvm-bugs@lists.llvm.org
Classification: Unclassified

The following loop takes 20-60% (depending on the machine) longer to run when compiled with clang 3.8+ compared to 3.7.1 (optimization level `-Ofast`, `-march=core-avx2`) for a cacheline-aligned array of 1024 floats. When the array is much larger, so that it no longer fits in the cache and the loop is memory-bandwidth limited, there is no performance difference anymore. (FWIW, somehow both are much faster than GCC 6.1...)

```c
__attribute__((noinline)) float sum32(float *a, size_t n)
{
    /* a = (float*)__builtin_assume_aligned(a, 64); */
    float s = 0;
    for (size_t i = 0; i < n; i++)
        s += a[i];
    return s;
}
```
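The timing harness itself is not quoted in the report (the reporter's full setup is in the gist linked below). A minimal driver consistent with the description above — the repetition count, initialization values, and `clock_gettime`-based timing here are assumptions, not from the report — might look like this, compiled together with `sum32` using the same flags (`clang -Ofast -march=core-avx2`):

```c
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

__attribute__((noinline)) float sum32(float *a, size_t n); /* defined above */

int main(void)
{
    enum { N = 1024, REPS = 1000000 }; /* assumed iteration count */

    /* Cacheline (64-byte) aligned buffer of 1024 floats, as described. */
    float *a = aligned_alloc(64, N * sizeof(float));
    if (!a)
        return 1;
    for (size_t i = 0; i < N; i++)
        a[i] = 1.0f;

    volatile float sink; /* keep the calls from being optimized away */
    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int r = 0; r < REPS; r++)
        sink = sum32(a, N);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    (void)sink;

    double ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
    printf("%.2f ns per call\n", ns / REPS);
    free(a);
    return 0;
}
```

Building this once with 3.7.1 and once with 3.8 should be enough to reproduce the gap for the cache-resident case.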
The C code, LLVM IR, and assembly output for 3.7 and 3.8 are available in [this gist](https://gist.github.com/yuyichao/5b07f71c1f19248ec5511d758532a4b0). The difference in assembly is also pasted below.

```diff
--- llvm37.s	2016-06-04 13:23:34.947819989 -0400
+++ llvm38.s	2016-06-04 13:14:25.455283889 -0400
@@ -4,103 +4,95 @@
 	vxorps	%xmm0, %xmm0, %xmm0
 	testq	%rsi, %rsi
 	je	.LBB1_13
-# BB#1: # %overflow.checked
+# BB#1: # %.lr.ph.preheader
+	vxorps	%xmm0, %xmm0, %xmm0
+	xorl	%ecx, %ecx
+	cmpq	$15, %rsi
+	jbe	.LBB1_2
+# BB#4: # %min.iters.checked
 	xorl	%ecx, %ecx
 	movq	%rsi, %rax
-	vxorps	%ymm0, %ymm0, %ymm0
-	vxorps	%ymm1, %ymm1, %ymm1
-	vxorps	%ymm2, %ymm2, %ymm2
-	vxorps	%ymm3, %ymm3, %ymm3
-	andq	$-32, %rax
-	je	.LBB1_10
-# BB#2: # %vector.body.preheader
-	leaq	-32(%rsi), %r8
+	andq	$-16, %rax
+	je	.LBB1_2
+# BB#5: # %vector.body.preheader
+	leaq	-16(%rsi), %r8
 	movl	%r8d, %ecx
-	shrl	$5, %ecx
+	shrl	$4, %ecx
 	addl	$1, %ecx
 	xorl	%edx, %edx
-	testb	$3, %cl
-	je	.LBB1_3
-# BB#4: # %vector.body.prol.preheader
-	leal	-32(%rsi), %ecx
-	shrl	$5, %ecx
+	testb	$7, %cl
+	je	.LBB1_6
+# BB#7: # %vector.body.prol.preheader
+	leal	-16(%rsi), %ecx
+	shrl	$4, %ecx
 	addl	$1, %ecx
-	andl	$3, %ecx
+	andl	$7, %ecx
 	negq	%rcx
 	vxorps	%ymm0, %ymm0, %ymm0
 	xorl	%edx, %edx
 	vxorps	%ymm1, %ymm1, %ymm1
-	vxorps	%ymm2, %ymm2, %ymm2
-	vxorps	%ymm3, %ymm3, %ymm3
 	.align	16, 0x90
-.LBB1_5: # %vector.body.prol
+.LBB1_8: # %vector.body.prol
 	# =>This Inner Loop Header: Depth=1
 	vaddps	(%rdi,%rdx,4), %ymm0, %ymm0
 	vaddps	32(%rdi,%rdx,4), %ymm1, %ymm1
-	vaddps	64(%rdi,%rdx,4), %ymm2, %ymm2
-	vaddps	96(%rdi,%rdx,4), %ymm3, %ymm3
-	addq	$32, %rdx
+	addq	$16, %rdx
 	addq	$1, %rcx
-	jne	.LBB1_5
-	jmp	.LBB1_6
-.LBB1_3:
+	jne	.LBB1_8
+	jmp	.LBB1_9
+.LBB1_6:
 	vxorps	%ymm0, %ymm0, %ymm0
 	vxorps	%ymm1, %ymm1, %ymm1
-	vxorps	%ymm2, %ymm2, %ymm2
-	vxorps	%ymm3, %ymm3, %ymm3
-.LBB1_6: # %vector.body.preheader.split
-	cmpq	$96, %r8
-	jb	.LBB1_9
-# BB#7: # %vector.body.preheader.split.split
+.LBB1_9: # %vector.body.preheader.split
+	cmpq	$112, %r8
+	jb	.LBB1_12
+# BB#10: # %vector.body.preheader.split.split
 	movq	%rsi, %rcx
-	andq	$-32, %rcx
+	andq	$-16, %rcx
 	subq	%rdx, %rcx
 	leaq	480(%rdi,%rdx,4), %rdx
 	.align	16, 0x90
-.LBB1_8: # %vector.body
+.LBB1_11: # %vector.body
 	# =>This Inner Loop Header: Depth=1
 	vaddps	-480(%rdx), %ymm0, %ymm0
 	vaddps	-448(%rdx), %ymm1, %ymm1
-	vaddps	-416(%rdx), %ymm2, %ymm2
-	vaddps	-384(%rdx), %ymm3, %ymm3
+	vaddps	-416(%rdx), %ymm0, %ymm0
+	vaddps	-384(%rdx), %ymm1, %ymm1
 	vaddps	-352(%rdx), %ymm0, %ymm0
 	vaddps	-320(%rdx), %ymm1, %ymm1
-	vaddps	-288(%rdx), %ymm2, %ymm2
-	vaddps	-256(%rdx), %ymm3, %ymm3
+	vaddps	-288(%rdx), %ymm0, %ymm0
+	vaddps	-256(%rdx), %ymm1, %ymm1
 	vaddps	-224(%rdx), %ymm0, %ymm0
 	vaddps	-192(%rdx), %ymm1, %ymm1
-	vaddps	-160(%rdx), %ymm2, %ymm2
-	vaddps	-128(%rdx), %ymm3, %ymm3
+	vaddps	-160(%rdx), %ymm0, %ymm0
+	vaddps	-128(%rdx), %ymm1, %ymm1
 	vaddps	-96(%rdx), %ymm0, %ymm0
 	vaddps	-64(%rdx), %ymm1, %ymm1
-	vaddps	-32(%rdx), %ymm2, %ymm2
-	vaddps	(%rdx), %ymm3, %ymm3
+	vaddps	-32(%rdx), %ymm0, %ymm0
+	vaddps	(%rdx), %ymm1, %ymm1
 	addq	$512, %rdx # imm = 0x200
 	addq	$-128, %rcx
-	jne	.LBB1_8
-.LBB1_9:
-	movq	%rax, %rcx
-.LBB1_10: # %middle.block
+	jne	.LBB1_11
+.LBB1_12: # %middle.block
 	vaddps	%ymm0, %ymm1, %ymm0
-	vaddps	%ymm0, %ymm2, %ymm0
-	vaddps	%ymm0, %ymm3, %ymm0
 	vextractf128	$1, %ymm0, %xmm1
 	vaddps	%ymm1, %ymm0, %ymm0
-	vpermilpd	$1, %ymm0, %ymm1 # ymm1 = ymm0[1,0,2,2]
+	vpermilpd	$1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
 	vaddps	%ymm1, %ymm0, %ymm0
 	vhaddps	%ymm0, %ymm0, %ymm0
-	cmpq	%rsi, %rcx
+	movq	%rax, %rcx
+	cmpq	%rsi, %rax
 	je	.LBB1_13
-# BB#11: # %.lr.ph.preheader
+.LBB1_2: # %.lr.ph.preheader13
 	leaq	(%rdi,%rcx,4), %rax
 	subq	%rcx, %rsi
 	.align	16, 0x90
-.LBB1_12: # %.lr.ph
+.LBB1_3: # %.lr.ph
 	# =>This Inner Loop Header: Depth=1
 	vaddss	(%rax), %xmm0, %xmm0
 	addq	$4, %rax
 	addq	$-1, %rsi
-	jne	.LBB1_12
+	jne	.LBB1_3
 .LBB1_13: # %._crit_edge
 	#APP
 	#NO_APP
```
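What the diff shows: 3.7's vector body spreads the partial sums over four independent accumulators (ymm0-ymm3), while 3.8 folds the same unrolled loads into only two (ymm0/ymm1), so every other `vaddps` now waits on the previous add to the same register. Since `vaddps` has multi-cycle latency (3 cycles on a Haswell-class core, with roughly one vector add issuing per cycle), two dependency chains cannot keep the FP add unit busy, which fits the observation that the slowdown appears only when the data is cache-resident rather than bandwidth-limited. A scalar sketch of the multiple-accumulator reduction that 3.7 emitted follows; it is illustrative only (`sum32_split` is not a name from the report), and `-Ofast` is what licenses this reassociation of float addition:

```c
float sum32_split(const float *a, size_t n)
{
    /* Four independent partial sums: four add chains can overlap in
       the pipeline instead of serializing on a single accumulator. */
    float s0 = 0, s1 = 0, s2 = 0, s3 = 0;
    size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        s0 += a[i];
        s1 += a[i + 1];
        s2 += a[i + 2];
        s3 += a[i + 3];
    }
    for (; i < n; i++) /* scalar remainder, like the %.lr.ph loop above */
        s0 += a[i];
    /* Combine the partial sums only at the end. */
    return (s0 + s1) + (s2 + s3);
}
```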