https://bugs.llvm.org/show_bug.cgi?id=47558
Bug ID: 47558
Summary: The number of SIMD loads increases unnecessarily
Product: libraries
Version: trunk
Hardware: PC
OS: Linux
Status: NEW
Severity: enhancement
Priority: P
Component: Scalar Optimizations
Assignee: unassignedb...@nondot.org
Reporter: k...@google.com
CC: llvm-bugs@lists.llvm.org
This patch:
https://github.com/llvm/llvm-project/commit/8fb055932c085da21f3b721995a06f42006744bd
increases the number of memory loads in certain cases.
Consider:
target triple = "x86_64-unknown-linux-gnu"
declare dso_local float* @getscaleptr() #0
define void @foo(<2 x float>* nonnull %resultptr, <2 x float>* nonnull %opptr)
{
%scaleptr = call nonnull align 16 dereferenceable(64) float* @getscaleptr()
%op = load <2 x float>, <2 x float>* %opptr, align 4
%scale = load float, float* %scaleptr, align 16
%op0 = extractelement <2 x float> %op, i32 0
%product0 = fmul float %op0, %scale
%result0 = insertelement <2 x float> undef, float %product0, i32 0
%op1 = extractelement <2 x float> %op, i32 1
%product1 = fmul float %op1, %scale
%result1 = insertelement <2 x float> %result0, float %product1, i32 1
store <2 x float> %result1, <2 x float>* %resultptr, align 8
ret void
}
This testcase multiplies a <2 x float> value by a scalar float value
and stores the result back to memory.
Compile like so:
$ clang -O2 -msse4.2 -S bug.ll -o bug.s
Then with and without the patch, I get the following assembly diff:
pushq %r14
pushq %rbx
pushq %rax
movq %rsi, %rbx
movq %rdi, %r14
callq getscaleptr
movsd (%rbx), %xmm0 # xmm0 = mem[0],zero
- movss (%rax), %xmm1 # xmm1 = mem[0],zero,zero,zero
- movsldup %xmm1, %xmm1 # xmm1 = xmm1[0,0,2,2]
+ movaps (%rax), %xmm1
+ insertps $16, (%rax), %xmm1 # xmm1 = xmm1[0],mem[0],xmm1[2,3]
mulps %xmm0, %xmm1
movlps %xmm1, (%r14)
addq $8, %rsp
popq %rbx
popq %r14
retq
Note that the patch replaces the single movsldup with a movaps plus an
insertps, and the insertps reads from the same memory location as the
movaps, increasing the number of loads.
Here is the "IR Dump After Optimize scalar/vector ops".
Without the patch:
%scaleptr = tail call nonnull align 16 dereferenceable(64) float* @getscaleptr()
%op = load <2 x float>, <2 x float>* %opptr, align 4
%scale = load float, float* %scaleptr, align 16
%1 = insertelement <2 x float> undef, float %scale, i32 0
%2 = insertelement <2 x float> %1, float %scale, i32 1
%3 = fmul <2 x float> %op, %2
%4 = extractelement <2 x float> %3, i32 0
%result0 = insertelement <2 x float> undef, float %4, i32 0
%5 = extractelement <2 x float> %3, i32 1
%result1 = insertelement <2 x float> %result0, float %5, i32 1
store <2 x float> %result1, <2 x float>* %resultptr, align 8
ret void
With the patch:
%scaleptr = tail call nonnull align 16 dereferenceable(64) float* @getscaleptr()
%op = load <2 x float>, <2 x float>* %opptr, align 4
%1 = bitcast float* %scaleptr to <4 x float>*
%2 = load <4 x float>, <4 x float>* %1, align 16
%3 = shufflevector <4 x float> %2, <4 x float> undef, <2 x i32> <i32 0, i32 1>
%scale = load float, float* %scaleptr, align 16
%4 = insertelement <2 x float> %3, float %scale, i32 1
%5 = fmul <2 x float> %op, %4
%6 = extractelement <2 x float> %5, i32 0
%result0 = insertelement <2 x float> undef, float %6, i32 0
%7 = extractelement <2 x float> %5, i32 1
%result1 = insertelement <2 x float> %result0, float %7, i32 1
store <2 x float> %result1, <2 x float>* %resultptr, align 8
ret void
Notice the three loads with the patch.
Here is the final LLVM IR.
Without the patch:
%scaleptr = tail call nonnull align 16 dereferenceable(64) float* @getscaleptr()
%op = load <2 x float>, <2 x float>* %opptr, align 4
%scale = load float, float* %scaleptr, align 16
%1 = insertelement <2 x float> undef, float %scale, i32 0
%2 = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
%3 = fmul <2 x float> %op, %2
store <2 x float> %3, <2 x float>* %resultptr, align 8
ret void
With the patch:
%scaleptr = tail call nonnull align 16 dereferenceable(64) float* @getscaleptr()
%op = load <2 x float>, <2 x float>* %opptr, align 4
%1 = bitcast float* %scaleptr to <4 x float>*
%2 = load <4 x float>, <4 x float>* %1, align 16
%3 = shufflevector <4 x float> %2, <4 x float> undef, <2 x i32> <i32 0, i32 undef>
%scale = load float, float* %scaleptr, align 16
%4 = insertelement <2 x float> %3, float %scale, i32 1
%5 = fmul <2 x float> %op, %4
store <2 x float> %5, <2 x float>* %resultptr, align 8
ret void
--
You are receiving this mail because:
You are on the CC list for the bug.
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs