https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101579
Bug ID: 101579
Summary: Suboptimal codegen for __builtin_shufflevector
Product: gcc
Version: 12.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: hjl.tools at gmail dot com
CC: crazylht at gmail dot com
Target Milestone: ---
Target: i386,x86-64
For the following test case:
---
/* Reduced test case for the report.
   U is a 32-byte vector of unsigned int; V is a 64-byte vector of
   unsigned char (GNU C vector extensions). */
typedef unsigned int __attribute__((__vector_size__ (32))) U;
typedef unsigned char __attribute__((__vector_size__ (64))) V;
/* Global 64-byte input vector read by foo. */
V g;
/* foo: shuffle the bytes of g, invert them, add g back, then reinterpret
   the result as U and add its element 1 to every element.  Returns the
   resulting 32-byte vector. */
U
foo (void)
{
/* 64-element byte shuffle of g with itself.  Every index below is in the
   range 0..5, so only the first six bytes of g are ever selected. */
V v = __builtin_shufflevector (g, g,
0, 1, 2, 0, 5, 1, 0, 1, 3, 2, 3, 0, 4, 3, 1,
2,
2, 0, 4, 2, 3, 1, 1, 2, 3, 4, 1, 1, 0, 0, 5,
2,
0, 3, 3, 3, 3, 4, 5, 0, 1, 5, 2, 1, 0, 1, 1,
2,
3, 2, 0, 5, 4, 5, 1, 0, 1, 4, 4, 3, 4, 5, 2,
0)
;
/* Flip all eight bits of every byte element. */
v ^= 255;
/* Element-wise byte addition (wraps modulo 256). */
V w = v + g;
/* Cast-to-union (GNU extension) views w through member b, which overlays
   the first 32 bytes of w.  b[1] is a scalar, so it is added to every
   element of b. */
U u = ((union { V a; U b; }) w).b + ((union { V a; U b; }) w).b[1];
return u;
}
---
GCC 12 with -O2 -march=skylake generates:
# GCC 12 output for foo (AT&T syntax), as quoted in the report; the foo:
# label and the closing .cfi_endproc are not part of the excerpt.
# Overall shape: the six source bytes g[0]..g[5] are loaded as scalars and
# the 64-byte shuffle result is rebuilt two bytes at a time with vpinsrb,
# then merged with vpunpcklwd / vpunpckldq / vpunpcklqdq.
.cfi_startproc
# Frame setup: save %rbp, align %rsp down to 64 bytes, reserve 72 bytes.
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
# ymm4 = all-ones (every byte is 0xff, i.e. -1).
vpcmpeqd %ymm4, %ymm4, %ymm4
movq %rsp, %rbp
.cfi_def_cfa_register 6
andq $-64, %rsp
subq $72, %rsp
# Zero-extending scalar loads of the six bytes the shuffle uses:
# ecx = g[2], edx = g[1], eax = g[0], edi = g[3], esi = g[5], r8d = g[4].
movzbl g+2(%rip), %ecx
movzbl g+1(%rip), %edx
movzbl g(%rip), %eax
movzbl g+3(%rip), %edi
movzbl g+5(%rip), %esi
movzbl g+4(%rip), %r8d
# Seed vector registers with one byte each:
# xmm7 = g[2], xmm0 = g[3], xmm1 = g[1], xmm5 = g[0].
vmovd %ecx, %xmm7
vmovd %edi, %xmm0
vmovd %edx, %xmm1
vmovd %eax, %xmm5
# Spill the g[2] seed; xmm7 is reused for g[5] below and the spill is
# reloaded into xmm13 later.
vmovdqa %xmm7, -72(%rsp)
vpinsrb $1, %eax, %xmm7, %xmm11
# xmm2 = g[4], xmm7 = g[5].
vmovd %r8d, %xmm2
vmovd %esi, %xmm7
# Build two-byte pairs (byte 0 from the seed register, byte 1 inserted),
# then interleave pairs -> dwords -> qwords to form shuffle bytes 0..15
# in xmm3.
vpinsrb $1, %edx, %xmm5, %xmm6
vpinsrb $1, %edi, %xmm2, %xmm14
vpinsrb $1, %ecx, %xmm1, %xmm12
vpinsrb $1, %ecx, %xmm0, %xmm15
vpinsrb $1, %edx, %xmm7, %xmm8
vpinsrb $1, %eax, %xmm0, %xmm3
vpunpcklwd %xmm11, %xmm6, %xmm13
vpunpcklwd %xmm12, %xmm14, %xmm9
vpunpcklwd %xmm6, %xmm8, %xmm8
vpunpcklwd %xmm3, %xmm15, %xmm3
vpunpckldq %xmm8, %xmm13, %xmm8
vpunpckldq %xmm9, %xmm3, %xmm3
vpunpcklqdq %xmm3, %xmm8, %xmm3
# ymm10 = g[0..31] + (-1) per byte.  For bytes, (v ^ 255) + g equals
# g - 1 - v, so the shuffle result is subtracted from this further down.
vpaddb g(%rip), %ymm4, %ymm10
# Spill the (g[4], g[3]) pair and shuffle bytes 0..15; both are reloaded
# later in the listing.
vmovdqa %xmm14, -88(%rsp)
vmovdqa %xmm3, -104(%rsp)
# Same pair-and-interleave scheme for shuffle bytes 16..31 (ends in xmm3).
vpinsrb $1, %r8d, %xmm0, %xmm9
vpinsrb $1, %ecx, %xmm7, %xmm4
vpinsrb $1, %ecx, %xmm2, %xmm3
vpinsrb $1, %edx, %xmm0, %xmm14
vpinsrb $1, %edx, %xmm1, %xmm8
vpinsrb $1, %eax, %xmm5, %xmm13
vpunpcklwd %xmm4, %xmm13, %xmm13
vpunpcklwd %xmm8, %xmm9, %xmm8
vpunpcklwd %xmm3, %xmm11, %xmm3
vpunpcklwd %xmm12, %xmm14, %xmm14
# Reload shuffle bytes 0..15 into xmm4.
vmovdqa -104(%rsp), %xmm4
vpunpckldq %xmm13, %xmm8, %xmm8
vpunpckldq %xmm14, %xmm3, %xmm3
vpunpcklqdq %xmm8, %xmm3, %xmm3
# Reload the g[2] seed spilled earlier.
vmovdqa -72(%rsp), %xmm13
# ymm3 = shuffle bytes 0..31 (xmm4 in the low lane, xmm3 in the high lane).
vinserti128 $0x1, %xmm3, %ymm4, %ymm3
# ymm10 = (g[0..31] - 1) - shuffle[0..31] = low 32 bytes of w.
vpsubb %ymm3, %ymm10, %ymm10
# Store the low 32 bytes of w at -56(%rsp) and keep bytes 0..15 in xmm8.
vmovdqa %ymm10, -56(%rsp)
vmovdqa %xmm10, %xmm8
# Shuffle bytes 32..47, built the same way (ends in xmm0).
vpinsrb $1, %esi, %xmm1, %xmm3
vpinsrb $1, %edi, %xmm5, %xmm10
vpinsrb $1, %edi, %xmm0, %xmm0
vpinsrb $1, %eax, %xmm7, %xmm7
vpinsrb $1, %edx, %xmm13, %xmm13
vpunpcklwd %xmm13, %xmm3, %xmm3
vpunpcklwd %xmm0, %xmm10, %xmm0
vpunpcklwd %xmm7, %xmm9, %xmm9
vpunpcklwd %xmm12, %xmm6, %xmm6
vpunpckldq %xmm6, %xmm3, %xmm6
vpunpckldq %xmm9, %xmm0, %xmm0
vpunpcklqdq %xmm6, %xmm0, %xmm0
# Shuffle bytes 48..63 (ends in xmm1); the -88(%rsp) operand is the
# (g[4], g[3]) pair spilled earlier.
vpinsrb $1, %eax, %xmm1, %xmm6
vpinsrb $1, %r8d, %xmm1, %xmm1
vpunpcklwd -88(%rsp), %xmm1, %xmm1
vpinsrb $1, %esi, %xmm5, %xmm5
vpinsrb $1, %esi, %xmm2, %xmm2
vpunpcklwd %xmm5, %xmm15, %xmm3
vpunpcklwd %xmm6, %xmm2, %xmm5
vpunpcklwd %xmm11, %xmm2, %xmm2
# Re-materialize all-ones in ymm4 (it was overwritten above).
vpcmpeqd %ymm4, %ymm4, %ymm4
vpunpckldq %xmm5, %xmm3, %xmm3
vpunpckldq %xmm2, %xmm1, %xmm1
# ymm4 = g[32..63] + (-1) per byte.
vpaddb g+32(%rip), %ymm4, %ymm4
vpunpcklqdq %xmm1, %xmm3, %xmm1
# ymm0 = shuffle bytes 32..63; ymm4 = high 32 bytes of w.
vinserti128 $0x1, %xmm1, %ymm0, %ymm0
vpsubb %ymm0, %ymm4, %ymm4
# Copy the low 32 bytes of w to 8(%rsp)..39(%rsp): bytes 0..15 from xmm8,
# bytes 16..31 reloaded from -40(%rsp).  The high half of w is stored at
# -24(%rsp) but is not read by any later instruction in this listing.
vmovdqa %xmm8, 8(%rsp)
vmovdqa %ymm4, -24(%rsp)
vmovdqa -40(%rsp), %xmm4
# Broadcast dword element 1 of the copy (offset 8 + 4) to all lanes and
# add the eight dwords of the copy; the sum in ymm0 is the return value.
vpbroadcastd 12(%rsp), %ymm0
vmovdqa %xmm4, 24(%rsp)
vpaddd 8(%rsp), %ymm0, %ymm0
# Tear down the frame and return.
leave
.cfi_def_cfa 7, 8
ret
Clang 12 generates:
# clang 12 output for foo (AT&T syntax).  Only the low 32 bytes of g are
# loaded: the function result depends solely on the low half of w, and the
# shuffle is done with a single in-lane vpshufb.
# The vpshufb annotation below was hard-wrapped over three lines in the
# report, which left two lines of bare text in the listing; it is rejoined
# here so the listing is valid assembler input again.
foo: # @foo
.cfi_startproc
# %bb.0:
# ymm0 = g[0..31].
vmovdqa g(%rip), %ymm0
# ymm1 = all-ones, then ymm1 = g[0..31] ^ 0xff (the "v ^= 255" step,
# applied before the shuffle).
vpcmpeqd %ymm1, %ymm1, %ymm1
vpxor %ymm1, %ymm0, %ymm1
# Duplicate the low 128 bits into both lanes so the in-lane byte shuffle
# can reach the same source bytes from either lane.
vpermq $68, %ymm1, %ymm1 # ymm1 = ymm1[0,1,0,1]
vpshufb .LCPI0_0(%rip), %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,0,5,1,0,1,3,2,3,0,4,3,1,2,18,16,20,18,19,17,17,18,19,20,17,17,16,16,21,18]
# ymm0 = shuffled/inverted bytes + g[0..31] = low 32 bytes of w.
vpaddb %ymm0, %ymm1, %ymm0
# Select dword element 1 of ymm0 into every lane and add it to ymm0.
vpbroadcastd .LCPI0_1(%rip), %ymm1 # ymm1 = [1,1,1,1,1,1,1,1]
vpermd %ymm0, %ymm1, %ymm1
vpaddd %ymm0, %ymm1, %ymm0
retq
.Lfunc_end0:
.size foo, .Lfunc_end0-foo
.cfi_endproc