https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101579
            Bug ID: 101579
           Summary: Suboptimal codegen for __builtin_shufflevector
           Product: gcc
           Version: 12.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hjl.tools at gmail dot com
                CC: crazylht at gmail dot com
  Target Milestone: ---
            Target: i386,x86-64

For

---
typedef unsigned int __attribute__((__vector_size__ (32))) U;
typedef unsigned char __attribute__((__vector_size__ (64))) V;

V g;

U
foo (void)
{
  V v = __builtin_shufflevector (g, g,
                                 0, 1, 2, 0, 5, 1, 0, 1, 3, 2, 3, 0, 4, 3, 1, 2,
                                 2, 0, 4, 2, 3, 1, 1, 2, 3, 4, 1, 1, 0, 0, 5, 2,
                                 0, 3, 3, 3, 3, 4, 5, 0, 1, 5, 2, 1, 0, 1, 1, 2,
                                 3, 2, 0, 5, 4, 5, 1, 0, 1, 4, 4, 3, 4, 5, 2, 0);
  v ^= 255;
  V w = v + g;
  U u = ((union { V a; U b; }) w).b + ((union { V a; U b; }) w).b[1];
  return u;
}
---

GCC 12 -march=skylake -O2 generates

        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        vpcmpeqd        %ymm4, %ymm4, %ymm4
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        andq    $-64, %rsp
        subq    $72, %rsp
        movzbl  g+2(%rip), %ecx
        movzbl  g+1(%rip), %edx
        movzbl  g(%rip), %eax
        movzbl  g+3(%rip), %edi
        movzbl  g+5(%rip), %esi
        movzbl  g+4(%rip), %r8d
        vmovd   %ecx, %xmm7
        vmovd   %edi, %xmm0
        vmovd   %edx, %xmm1
        vmovd   %eax, %xmm5
        vmovdqa %xmm7, -72(%rsp)
        vpinsrb $1, %eax, %xmm7, %xmm11
        vmovd   %r8d, %xmm2
        vmovd   %esi, %xmm7
        vpinsrb $1, %edx, %xmm5, %xmm6
        vpinsrb $1, %edi, %xmm2, %xmm14
        vpinsrb $1, %ecx, %xmm1, %xmm12
        vpinsrb $1, %ecx, %xmm0, %xmm15
        vpinsrb $1, %edx, %xmm7, %xmm8
        vpinsrb $1, %eax, %xmm0, %xmm3
        vpunpcklwd      %xmm11, %xmm6, %xmm13
        vpunpcklwd      %xmm12, %xmm14, %xmm9
        vpunpcklwd      %xmm6, %xmm8, %xmm8
        vpunpcklwd      %xmm3, %xmm15, %xmm3
        vpunpckldq      %xmm8, %xmm13, %xmm8
        vpunpckldq      %xmm9, %xmm3, %xmm3
        vpunpcklqdq     %xmm3, %xmm8, %xmm3
        vpaddb  g(%rip), %ymm4, %ymm10
        vmovdqa %xmm14, -88(%rsp)
        vmovdqa %xmm3, -104(%rsp)
        vpinsrb $1, %r8d, %xmm0, %xmm9
        vpinsrb $1, %ecx, %xmm7, %xmm4
        vpinsrb $1, %ecx, %xmm2, %xmm3
        vpinsrb $1, %edx, %xmm0, %xmm14
        vpinsrb $1, %edx, %xmm1, %xmm8
        vpinsrb $1, %eax, %xmm5, %xmm13
        vpunpcklwd      %xmm4, %xmm13, %xmm13
        vpunpcklwd      %xmm8, %xmm9, %xmm8
        vpunpcklwd      %xmm3, %xmm11, %xmm3
        vpunpcklwd      %xmm12, %xmm14, %xmm14
        vmovdqa -104(%rsp), %xmm4
        vpunpckldq      %xmm13, %xmm8, %xmm8
        vpunpckldq      %xmm14, %xmm3, %xmm3
        vpunpcklqdq     %xmm8, %xmm3, %xmm3
        vmovdqa -72(%rsp), %xmm13
        vinserti128     $0x1, %xmm3, %ymm4, %ymm3
        vpsubb  %ymm3, %ymm10, %ymm10
        vmovdqa %ymm10, -56(%rsp)
        vmovdqa %xmm10, %xmm8
        vpinsrb $1, %esi, %xmm1, %xmm3
        vpinsrb $1, %edi, %xmm5, %xmm10
        vpinsrb $1, %edi, %xmm0, %xmm0
        vpinsrb $1, %eax, %xmm7, %xmm7
        vpinsrb $1, %edx, %xmm13, %xmm13
        vpunpcklwd      %xmm13, %xmm3, %xmm3
        vpunpcklwd      %xmm0, %xmm10, %xmm0
        vpunpcklwd      %xmm7, %xmm9, %xmm9
        vpunpcklwd      %xmm12, %xmm6, %xmm6
        vpunpckldq      %xmm6, %xmm3, %xmm6
        vpunpckldq      %xmm9, %xmm0, %xmm0
        vpunpcklqdq     %xmm6, %xmm0, %xmm0
        vpinsrb $1, %eax, %xmm1, %xmm6
        vpinsrb $1, %r8d, %xmm1, %xmm1
        vpunpcklwd      -88(%rsp), %xmm1, %xmm1
        vpinsrb $1, %esi, %xmm5, %xmm5
        vpinsrb $1, %esi, %xmm2, %xmm2
        vpunpcklwd      %xmm5, %xmm15, %xmm3
        vpunpcklwd      %xmm6, %xmm2, %xmm5
        vpunpcklwd      %xmm11, %xmm2, %xmm2
        vpcmpeqd        %ymm4, %ymm4, %ymm4
        vpunpckldq      %xmm5, %xmm3, %xmm3
        vpunpckldq      %xmm2, %xmm1, %xmm1
        vpaddb  g+32(%rip), %ymm4, %ymm4
        vpunpcklqdq     %xmm1, %xmm3, %xmm1
        vinserti128     $0x1, %xmm1, %ymm0, %ymm0
        vpsubb  %ymm0, %ymm4, %ymm4
        vmovdqa %xmm8, 8(%rsp)
        vmovdqa %ymm4, -24(%rsp)
        vmovdqa -40(%rsp), %xmm4
        vpbroadcastd    12(%rsp), %ymm0
        vmovdqa %xmm4, 24(%rsp)
        vpaddd  8(%rsp), %ymm0, %ymm0
        leave
        .cfi_def_cfa 7, 8
        ret

clang 12 generates

foo:                                    # @foo
        .cfi_startproc
# %bb.0:
        vmovdqa g(%rip), %ymm0
        vpcmpeqd        %ymm1, %ymm1, %ymm1
        vpxor   %ymm1, %ymm0, %ymm1
        vpermq  $68, %ymm1, %ymm1       # ymm1 = ymm1[0,1,0,1]
        vpshufb .LCPI0_0(%rip), %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,0,5,1,0,1,3,2,3,0,4,3,1,2,18,16,20,18,19,17,17,18,19,20,17,17,16,16,21,18]
        vpaddb  %ymm0, %ymm1, %ymm0
        vpbroadcastd    .LCPI0_1(%rip), %ymm1 # ymm1 = [1,1,1,1,1,1,1,1]
        vpermd  %ymm0, %ymm1, %ymm1
        vpaddd  %ymm0, %ymm1, %ymm0
        retq
.Lfunc_end0:
        .size   foo, .Lfunc_end0-foo
        .cfi_endproc