https://bugs.llvm.org/show_bug.cgi?id=33434
Bug ID: 33434
Summary: 256 bit double shuffles not optimal
Product: libraries
Version: trunk
Hardware: PC
OS: Linux
Status: NEW
Severity: enhancement
Priority: P
Component: Backend: X86
Assignee: unassignedb...@nondot.org
Reporter: tob...@grosser.es
CC: llvm-bugs@lists.llvm.org
Hi,
I just tried to generate AVX2 code for some 256 bit AVX2 double shuffles, but
despite Chandler's outstanding work on improving X86 shuffles two years ago,
the shuffle sequences seem not to be optimal (using llc out.ll -o -
-mcpu=x86-64 -mattr=+avx2 on r304555).
; Module-level data layout string for the reproducer functions in this report.
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; test_0: loads one <4 x double> from %PA and one from %PB, builds two
; two-input shuffles of them, and stores the results back through %PA and %PB.
; In the shuffle masks, indices 0-3 select elements of %A and 4-7 select
; elements of %B.
define void @test_0(<4 x double>* %PA, <4 x double>* %PB) {
entry:
  %A = load <4 x double>, <4 x double>* %PA
  %B = load <4 x double>, <4 x double>* %PB
  ; %SA = A[0], B[0], A[2], A[3]
  %SA = shufflevector <4 x double> %A, <4 x double> %B, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
  ; %SB = A[1], B[1], B[2], B[3]
  %SB = shufflevector <4 x double> %A, <4 x double> %B, <4 x i32> <i32 1, i32 5, i32 6, i32 7>
  ; Shuffle sequence quoted in the report for this function
  ; (llc -mcpu=x86-64 -mattr=+avx2, r304555):
  ; vmovddup %xmm1, %xmm2 # xmm2 = xmm1[0,0]
  ; vblendpd $2, %ymm2, %ymm0, %ymm2 # ymm2 = ymm0[0],ymm2[1],ymm0[2,3]
  ; vpermilpd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0]
  ; vblendpd $1, %ymm0, %ymm1, %ymm0 # ymm0 = ymm0[0],ymm1[1,2,3]
  store <4 x double> %SA, <4 x double>* %PA
  store <4 x double> %SB, <4 x double>* %PB
  ret void
}
; test_1: loads one <4 x double> from %PA and one from %PB, builds two
; two-input shuffles of them, and stores the results back through %PA and %PB.
; In the shuffle masks, indices 0-3 select elements of %A and 4-7 select
; elements of %B.
define void @test_1(<4 x double>* %PA, <4 x double>* %PB) {
entry:
  %A = load <4 x double>, <4 x double>* %PA
  %B = load <4 x double>, <4 x double>* %PB
  ; %SA = B[0], B[1], A[0], B[2]
  %SA = shufflevector <4 x double> %A, <4 x double> %B, <4 x i32> <i32 4, i32 5, i32 0, i32 6>
  ; %SB = A[2], A[3], A[1], B[3]
  %SB = shufflevector <4 x double> %A, <4 x double> %B, <4 x i32> <i32 2, i32 3, i32 1, i32 7>
  ; Shuffle sequence quoted in the report for this function
  ; (llc -mcpu=x86-64 -mattr=+avx2, r304555):
  ; vinsertf128 $1, %xmm0, %ymm0, %ymm2
  ; vpermilpd $2, %ymm1, %ymm3 # ymm3 = ymm1[0,1,2,2]
  ; vblendpd $4, %ymm2, %ymm3, %ymm2 # ymm2 = ymm3[0,1],ymm2[2],ymm3[3]
  ; vpermpd $222, %ymm0, %ymm0 # ymm0 = ymm0[2,3,1,3]
  ; vblendpd $8, %ymm1, %ymm0, %ymm0 # ymm0 = ymm0[0,1,2],ymm1[3]
  store <4 x double> %SA, <4 x double>* %PA
  store <4 x double> %SB, <4 x double>* %PB
  ret void
}
; test_2: reads a <4 x double> from each pointer, recombines the two vectors
; with a pair of two-input shuffles, and writes one result back through each
; pointer. Mask indices 0-3 pick elements of the first shuffle operand and
; 4-7 pick elements of the second.
define void @test_2(<4 x double>* %PA, <4 x double>* %PB) {
entry:
  %VecA = load <4 x double>, <4 x double>* %PA
  %VecB = load <4 x double>, <4 x double>* %PB
  ; Elements 0,1 of each input: VecA[0], VecA[1], VecB[0], VecB[1].
  %LowPairs = shufflevector <4 x double> %VecA, <4 x double> %VecB, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ; Elements 2,3 of each input: VecA[2], VecA[3], VecB[2], VecB[3].
  %HighPairs = shufflevector <4 x double> %VecA, <4 x double> %VecB, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ; Shuffle sequence quoted in the report for this function
  ; (llc -mcpu=x86-64 -mattr=+avx2, r304555):
  ; vinsertf128 $1, %xmm1, %ymm0, %ymm2
  ; vperm2f128 $49, %ymm1, %ymm0, %ymm0 # ymm0 = ymm0[2,3],ymm1[2,3]
  store <4 x double> %LowPairs, <4 x double>* %PA
  store <4 x double> %HighPairs, <4 x double>* %PB
  ret void
}
Am I missing something or could these really be translated to at most two
vblendpd instructions?
Best,
Tobias
--
You are receiving this mail because:
You are on the CC list for the bug.
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs