https://llvm.org/bugs/show_bug.cgi?id=28002
Bug ID: 28002
Summary: Performance regression summing small float array
Product: libraries
Version: 3.8
Hardware: PC
OS: Linux
Status: NEW
Severity: normal
Priority: P
Component: Backend: X86
Assignee: unassignedb...@nondot.org
Reporter: yyc1...@gmail.com
CC: llvm-bugs@lists.llvm.org
Classification: Unclassified

The following loop takes 20-60% (depending on the machine) longer to run when compiled with clang 3.8+ compared to 3.7.1 (optimization level `-Ofast`, `-march=core-avx2`) for a cacheline-aligned array of 1024 floats. When the array is much larger, so that it no longer fits in the cache and the loop is memory-bandwidth limited, there is no performance difference anymore. (FWIW, somehow both are much faster than GCC 6.1...)

```c
__attribute__((noinline)) float sum32(float *a, size_t n)
{
    /* a = (float*)__builtin_assume_aligned(a, 64); */
    float s = 0;
    for (size_t i = 0; i < n; i++)
        s += a[i];
    return s;
}
```
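The timing harness itself is not quoted in the report (the reporter's full setup is in the gist linked below). A minimal driver consistent with the description above — the repetition count, initialization values, and `clock_gettime`-based timing here are assumptions, not from the report — might look like this, compiled together with `sum32` using the same flags (`clang -Ofast -march=core-avx2`):

```c
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

__attribute__((noinline)) float sum32(float *a, size_t n); /* defined above */

int main(void)
{
    enum { N = 1024, REPS = 1000000 }; /* assumed iteration count */

    /* Cacheline (64-byte) aligned buffer of 1024 floats, as described. */
    float *a = aligned_alloc(64, N * sizeof(float));
    if (!a)
        return 1;
    for (size_t i = 0; i < N; i++)
        a[i] = 1.0f;

    volatile float sink; /* keep the calls from being optimized away */
    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int r = 0; r < REPS; r++)
        sink = sum32(a, N);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    (void)sink;

    double ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
    printf("%.2f ns per call\n", ns / REPS);
    free(a);
    return 0;
}
```

Building this once with 3.7.1 and once with 3.8 should be enough to reproduce the gap for the cache-resident case.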
The C code, LLVM IR, and assembly output for 3.7 and 3.8 are available in [this gist](https://gist.github.com/yuyichao/5b07f71c1f19248ec5511d758532a4b0). The difference in assembly is also pasted below.

```diff
--- llvm37.s	2016-06-04 13:23:34.947819989 -0400
+++ llvm38.s	2016-06-04 13:14:25.455283889 -0400
@@ -4,103 +4,95 @@
 	vxorps	%xmm0, %xmm0, %xmm0
 	testq	%rsi, %rsi
 	je	.LBB1_13
-# BB#1: # %overflow.checked
+# BB#1: # %.lr.ph.preheader
+	vxorps	%xmm0, %xmm0, %xmm0
+	xorl	%ecx, %ecx
+	cmpq	$15, %rsi
+	jbe	.LBB1_2
+# BB#4: # %min.iters.checked
 	xorl	%ecx, %ecx
 	movq	%rsi, %rax
-	vxorps	%ymm0, %ymm0, %ymm0
-	vxorps	%ymm1, %ymm1, %ymm1
-	vxorps	%ymm2, %ymm2, %ymm2
-	vxorps	%ymm3, %ymm3, %ymm3
-	andq	$-32, %rax
-	je	.LBB1_10
-# BB#2: # %vector.body.preheader
-	leaq	-32(%rsi), %r8
+	andq	$-16, %rax
+	je	.LBB1_2
+# BB#5: # %vector.body.preheader
+	leaq	-16(%rsi), %r8
 	movl	%r8d, %ecx
-	shrl	$5, %ecx
+	shrl	$4, %ecx
 	addl	$1, %ecx
 	xorl	%edx, %edx
-	testb	$3, %cl
-	je	.LBB1_3
-# BB#4: # %vector.body.prol.preheader
-	leal	-32(%rsi), %ecx
-	shrl	$5, %ecx
+	testb	$7, %cl
+	je	.LBB1_6
+# BB#7: # %vector.body.prol.preheader
+	leal	-16(%rsi), %ecx
+	shrl	$4, %ecx
 	addl	$1, %ecx
-	andl	$3, %ecx
+	andl	$7, %ecx
 	negq	%rcx
 	vxorps	%ymm0, %ymm0, %ymm0
 	xorl	%edx, %edx
 	vxorps	%ymm1, %ymm1, %ymm1
-	vxorps	%ymm2, %ymm2, %ymm2
-	vxorps	%ymm3, %ymm3, %ymm3
 	.align	16, 0x90
-.LBB1_5: # %vector.body.prol
+.LBB1_8: # %vector.body.prol
 	# =>This Inner Loop Header: Depth=1
 	vaddps	(%rdi,%rdx,4), %ymm0, %ymm0
 	vaddps	32(%rdi,%rdx,4), %ymm1, %ymm1
-	vaddps	64(%rdi,%rdx,4), %ymm2, %ymm2
-	vaddps	96(%rdi,%rdx,4), %ymm3, %ymm3
-	addq	$32, %rdx
+	addq	$16, %rdx
 	addq	$1, %rcx
-	jne	.LBB1_5
-	jmp	.LBB1_6
-.LBB1_3:
+	jne	.LBB1_8
+	jmp	.LBB1_9
+.LBB1_6:
 	vxorps	%ymm0, %ymm0, %ymm0
 	vxorps	%ymm1, %ymm1, %ymm1
-	vxorps	%ymm2, %ymm2, %ymm2
-	vxorps	%ymm3, %ymm3, %ymm3
-.LBB1_6: # %vector.body.preheader.split
-	cmpq	$96, %r8
-	jb	.LBB1_9
-# BB#7: # %vector.body.preheader.split.split
+.LBB1_9: # %vector.body.preheader.split
+	cmpq	$112, %r8
+	jb	.LBB1_12
+# BB#10: # %vector.body.preheader.split.split
 	movq	%rsi, %rcx
-	andq	$-32, %rcx
+	andq	$-16, %rcx
 	subq	%rdx, %rcx
 	leaq	480(%rdi,%rdx,4), %rdx
 	.align	16, 0x90
-.LBB1_8: # %vector.body
+.LBB1_11: # %vector.body
 	# =>This Inner Loop Header: Depth=1
 	vaddps	-480(%rdx), %ymm0, %ymm0
 	vaddps	-448(%rdx), %ymm1, %ymm1
-	vaddps	-416(%rdx), %ymm2, %ymm2
-	vaddps	-384(%rdx), %ymm3, %ymm3
+	vaddps	-416(%rdx), %ymm0, %ymm0
+	vaddps	-384(%rdx), %ymm1, %ymm1
 	vaddps	-352(%rdx), %ymm0, %ymm0
 	vaddps	-320(%rdx), %ymm1, %ymm1
-	vaddps	-288(%rdx), %ymm2, %ymm2
-	vaddps	-256(%rdx), %ymm3, %ymm3
+	vaddps	-288(%rdx), %ymm0, %ymm0
+	vaddps	-256(%rdx), %ymm1, %ymm1
 	vaddps	-224(%rdx), %ymm0, %ymm0
 	vaddps	-192(%rdx), %ymm1, %ymm1
-	vaddps	-160(%rdx), %ymm2, %ymm2
-	vaddps	-128(%rdx), %ymm3, %ymm3
+	vaddps	-160(%rdx), %ymm0, %ymm0
+	vaddps	-128(%rdx), %ymm1, %ymm1
 	vaddps	-96(%rdx), %ymm0, %ymm0
 	vaddps	-64(%rdx), %ymm1, %ymm1
-	vaddps	-32(%rdx), %ymm2, %ymm2
-	vaddps	(%rdx), %ymm3, %ymm3
+	vaddps	-32(%rdx), %ymm0, %ymm0
+	vaddps	(%rdx), %ymm1, %ymm1
 	addq	$512, %rdx # imm = 0x200
 	addq	$-128, %rcx
-	jne	.LBB1_8
-.LBB1_9:
-	movq	%rax, %rcx
-.LBB1_10: # %middle.block
+	jne	.LBB1_11
+.LBB1_12: # %middle.block
 	vaddps	%ymm0, %ymm1, %ymm0
-	vaddps	%ymm0, %ymm2, %ymm0
-	vaddps	%ymm0, %ymm3, %ymm0
 	vextractf128	$1, %ymm0, %xmm1
 	vaddps	%ymm1, %ymm0, %ymm0
-	vpermilpd	$1, %ymm0, %ymm1 # ymm1 = ymm0[1,0,2,2]
+	vpermilpd	$1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
 	vaddps	%ymm1, %ymm0, %ymm0
 	vhaddps	%ymm0, %ymm0, %ymm0
-	cmpq	%rsi, %rcx
+	movq	%rax, %rcx
+	cmpq	%rsi, %rax
 	je	.LBB1_13
-# BB#11: # %.lr.ph.preheader
+.LBB1_2: # %.lr.ph.preheader13
 	leaq	(%rdi,%rcx,4), %rax
 	subq	%rcx, %rsi
 	.align	16, 0x90
-.LBB1_12: # %.lr.ph
+.LBB1_3: # %.lr.ph
 	# =>This Inner Loop Header: Depth=1
 	vaddss	(%rax), %xmm0, %xmm0
 	addq	$4, %rax
 	addq	$-1, %rsi
-	jne	.LBB1_12
+	jne	.LBB1_3
 .LBB1_13: # %._crit_edge
 	#APP
 	#NO_APP
```
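What the diff shows: 3.7's vector body spreads the partial sums over four independent accumulators (ymm0-ymm3), while 3.8 folds the same unrolled loads into only two (ymm0/ymm1), so every other `vaddps` now waits on the previous add to the same register. Since `vaddps` has multi-cycle latency (3 cycles on a Haswell-class core, with roughly one vector add issuing per cycle), two dependency chains cannot keep the FP add unit busy, which fits the observation that the slowdown appears only when the data is cache-resident rather than bandwidth-limited. A scalar sketch of the multiple-accumulator reduction that 3.7 emitted follows; it is illustrative only (`sum32_split` is not a name from the report), and `-Ofast` is what licenses this reassociation of float addition:

```c
float sum32_split(const float *a, size_t n)
{
    /* Four independent partial sums: four add chains can overlap in
       the pipeline instead of serializing on a single accumulator. */
    float s0 = 0, s1 = 0, s2 = 0, s3 = 0;
    size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        s0 += a[i];
        s1 += a[i + 1];
        s2 += a[i + 2];
        s3 += a[i + 3];
    }
    for (; i < n; i++) /* scalar remainder, like the %.lr.ph loop above */
        s0 += a[i];
    /* Combine the partial sums only at the end. */
    return (s0 + s1) + (s2 + s3);
}
```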