[llvm-bugs] [Bug 43460] New: [AArch64] Vector store of scalars produces sub-optimal code

via llvm-bugs Thu, 26 Sep 2019 04:31:44 -0700

https://bugs.llvm.org/show_bug.cgi?id=43460


            Bug ID: 43460
           Summary: [AArch64] Vector store of scalars produces sub-optimal
                    code
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: All
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Backend: AArch64
          Assignee: unassignedb...@nondot.org
          Reporter: florian_h...@apple.com
                CC: arnaud.degrandmai...@arm.com,
                    llvm-bugs@lists.llvm.org, peter.sm...@linaro.org,
                    ties.st...@arm.com

It looks like we fail to generate optimal code when storing a vector that is
built from scalars. Consider the examples below (also available at
https://godbolt.org/z/hpv4S9). I am not sure how common these cases actually
are; I just stumbled across this while looking at
http://lists.llvm.org/pipermail/llvm-dev/2019-September/135432.html .


; Vector version (constants): stores the constant vector <2, 3> to %c with a
; single <2 x i32> store.
define void @const_vec(<2 x i32>* %c)  {
  store <2 x i32> <i32 2, i32 3>, <2 x i32>* %c, align 16
  ret void
}

; Scalar version (constants): writes the constants 1 and 2 to elements 0 and 1
; of the <4 x i32> at %c using two separate i32 stores. Elements 2 and 3 are
; not written.
define void @const_split(<4 x i32>* %c)  {
entry:
  %0 = getelementptr inbounds <4 x i32>, <4 x i32>* %c, i64 0, i64 0
  store i32 1, i32* %0, align 4
  %1 = getelementptr <4 x i32>, <4 x i32>* %c, i64 0, i64 1
  store i32 2, i32* %1, align 4
   ret void
}




With llc -O3 -mtriple=aarch64, we generate the assembly below. For the vector
version, we miss that the constant can be materialized with mov/movk (as is done
for the scalar version) and instead load it from a constant pool in memory.

// Constant-pool entry holding the two 32-bit elements (2, 3) of the vector
// that const_vec stores.
.LCPI0_0:
  .word 2 // 0x2
  .word 3 // 0x3
// Code generated for @const_vec (x0 = %c): the 64-bit constant is loaded from
// the constant pool into d0 and then stored to [x0].
const_vec: // @const_vec
  // adrp plus the :lo12: offset form the address of .LCPI0_0
  adrp x8, .LCPI0_0
  ldr d0, [x8, :lo12:.LCPI0_0]
  str d0, [x0]
  ret
// Code generated for @const_split (x0 = %c): both i32 constants are merged into
// one 64-bit immediate in x8 (bits 0-31 = 1, bits 32-47 = 2) and written with a
// single 64-bit store; no memory load is needed.
const_split: // @const_split
  mov x8, #1
  // movk replaces only the 16-bit field at bit 32, keeping the low word set above
  movk x8, #2, lsl #32
  str x8, [x0]
  ret


When storing two arbitrary i32 values, the vector version emits an extra fmov
and mov (lane insert) before the store, whereas the scalar version needs only a
single stp.


; Vector version (variables): builds a <2 x i32> from %a (element 0) and %b
; (element 1) with two insertelement instructions, then stores it to %c with a
; single vector store.
define void @var_vec_2(<2 x i32>* %c, i32 %a, i32 %b)  {
  %ins1 = insertelement <2 x i32> undef, i32 %a, i32 0
  %ins2 = insertelement <2 x i32> %ins1, i32 %b, i32 1
  store <2 x i32> %ins2, <2 x i32>* %c, align 16
  ret void
}

; Scalar version (variables): writes %a and %b to elements 0 and 1 of the
; <4 x i32> at %c using two separate i32 stores. Elements 2 and 3 are not
; written.
define void @var_split(<4 x i32>* %c, i32 %a, i32 %b)  {
entry:
  %0 = getelementptr inbounds <4 x i32>, <4 x i32>* %c, i64 0, i64 0
  store i32 %a, i32* %0, align 4
  %1 = getelementptr <4 x i32>, <4 x i32>* %c, i64 0, i64 1
  store i32 %b, i32* %1, align 4
   ret void
}

// Code generated for @var_vec_2 (x0 = %c, w1 = %a, w2 = %b): the two scalars
// are first moved into lanes 0 and 1 of v0, then the low 64 bits (d0) are
// stored to [x0].
var_vec_2: // @var_vec_2
  // general-purpose register -> vector register, lane 0
  fmov s0, w1
  // insert the second scalar into lane 1
  mov v0.s[1], w2
  str d0, [x0]
  ret
// Code generated for @var_split (x0 = %c, w1 = %a, w2 = %b): both scalars are
// written with a single store-pair directly from the general-purpose registers.
var_split: // @var_split
  stp w1, w2, [x0]
  ret

-- 
You are receiving this mail because:
You are on the CC list for the bug.

_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

[llvm-bugs] [Bug 43460] New: [AArch64] Vector store of scalars produces sub-optimal code

Reply via email to