https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101579

            Bug ID: 101579
           Summary: Suboptimal codegen for __builtin_shufflevector
           Product: gcc
           Version: 12.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hjl.tools at gmail dot com
                CC: crazylht at gmail dot com
  Target Milestone: ---
            Target: i386,x86-64

For

---
typedef unsigned int __attribute__((__vector_size__ (32))) U;
typedef unsigned char __attribute__((__vector_size__ (64))) V;

V g;

U
foo (void)
{
  V v = __builtin_shufflevector (g, g,
                                 0, 1, 2, 0, 5, 1, 0, 1, 3, 2, 3, 0, 4, 3, 1, 2,
                                 2, 0, 4, 2, 3, 1, 1, 2, 3, 4, 1, 1, 0, 0, 5, 2,
                                 0, 3, 3, 3, 3, 4, 5, 0, 1, 5, 2, 1, 0, 1, 1, 2,
                                 3, 2, 0, 5, 4, 5, 1, 0, 1, 4, 4, 3, 4, 5, 2, 0);
  v ^= 255;
  V w = v + g;
  U u = ((union { V a; U b; }) w).b + ((union { V a; U b; }) w).b[1];
  return u;
}
---

GCC 12 with -march=skylake -O2 generates:

        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        vpcmpeqd        %ymm4, %ymm4, %ymm4
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        andq    $-64, %rsp
        subq    $72, %rsp
        movzbl  g+2(%rip), %ecx
        movzbl  g+1(%rip), %edx
        movzbl  g(%rip), %eax
        movzbl  g+3(%rip), %edi
        movzbl  g+5(%rip), %esi
        movzbl  g+4(%rip), %r8d
        vmovd   %ecx, %xmm7
        vmovd   %edi, %xmm0
        vmovd   %edx, %xmm1
        vmovd   %eax, %xmm5
        vmovdqa %xmm7, -72(%rsp)
        vpinsrb $1, %eax, %xmm7, %xmm11
        vmovd   %r8d, %xmm2
        vmovd   %esi, %xmm7
        vpinsrb $1, %edx, %xmm5, %xmm6
        vpinsrb $1, %edi, %xmm2, %xmm14
        vpinsrb $1, %ecx, %xmm1, %xmm12
        vpinsrb $1, %ecx, %xmm0, %xmm15
        vpinsrb $1, %edx, %xmm7, %xmm8
        vpinsrb $1, %eax, %xmm0, %xmm3
        vpunpcklwd      %xmm11, %xmm6, %xmm13
        vpunpcklwd      %xmm12, %xmm14, %xmm9
        vpunpcklwd      %xmm6, %xmm8, %xmm8
        vpunpcklwd      %xmm3, %xmm15, %xmm3
        vpunpckldq      %xmm8, %xmm13, %xmm8
        vpunpckldq      %xmm9, %xmm3, %xmm3
        vpunpcklqdq     %xmm3, %xmm8, %xmm3
        vpaddb  g(%rip), %ymm4, %ymm10
        vmovdqa %xmm14, -88(%rsp)
        vmovdqa %xmm3, -104(%rsp)
        vpinsrb $1, %r8d, %xmm0, %xmm9
        vpinsrb $1, %ecx, %xmm7, %xmm4
        vpinsrb $1, %ecx, %xmm2, %xmm3
        vpinsrb $1, %edx, %xmm0, %xmm14
        vpinsrb $1, %edx, %xmm1, %xmm8
        vpinsrb $1, %eax, %xmm5, %xmm13
        vpunpcklwd      %xmm4, %xmm13, %xmm13
        vpunpcklwd      %xmm8, %xmm9, %xmm8
        vpunpcklwd      %xmm3, %xmm11, %xmm3
        vpunpcklwd      %xmm12, %xmm14, %xmm14
        vmovdqa -104(%rsp), %xmm4
        vpunpckldq      %xmm13, %xmm8, %xmm8
        vpunpckldq      %xmm14, %xmm3, %xmm3
        vpunpcklqdq     %xmm8, %xmm3, %xmm3
        vmovdqa -72(%rsp), %xmm13
        vinserti128     $0x1, %xmm3, %ymm4, %ymm3
        vpsubb  %ymm3, %ymm10, %ymm10
        vmovdqa %ymm10, -56(%rsp)
        vmovdqa %xmm10, %xmm8
        vpinsrb $1, %esi, %xmm1, %xmm3
        vpinsrb $1, %edi, %xmm5, %xmm10
        vpinsrb $1, %edi, %xmm0, %xmm0
        vpinsrb $1, %eax, %xmm7, %xmm7
        vpinsrb $1, %edx, %xmm13, %xmm13
        vpunpcklwd      %xmm13, %xmm3, %xmm3
        vpunpcklwd      %xmm0, %xmm10, %xmm0
        vpunpcklwd      %xmm7, %xmm9, %xmm9
        vpunpcklwd      %xmm12, %xmm6, %xmm6
        vpunpckldq      %xmm6, %xmm3, %xmm6
        vpunpckldq      %xmm9, %xmm0, %xmm0
        vpunpcklqdq     %xmm6, %xmm0, %xmm0
        vpinsrb $1, %eax, %xmm1, %xmm6
        vpinsrb $1, %r8d, %xmm1, %xmm1
        vpunpcklwd      -88(%rsp), %xmm1, %xmm1
        vpinsrb $1, %esi, %xmm5, %xmm5
        vpinsrb $1, %esi, %xmm2, %xmm2
        vpunpcklwd      %xmm5, %xmm15, %xmm3
        vpunpcklwd      %xmm6, %xmm2, %xmm5
        vpunpcklwd      %xmm11, %xmm2, %xmm2
        vpcmpeqd        %ymm4, %ymm4, %ymm4
        vpunpckldq      %xmm5, %xmm3, %xmm3
        vpunpckldq      %xmm2, %xmm1, %xmm1
        vpaddb  g+32(%rip), %ymm4, %ymm4
        vpunpcklqdq     %xmm1, %xmm3, %xmm1
        vinserti128     $0x1, %xmm1, %ymm0, %ymm0
        vpsubb  %ymm0, %ymm4, %ymm4
        vmovdqa %xmm8, 8(%rsp)
        vmovdqa %ymm4, -24(%rsp)
        vmovdqa -40(%rsp), %xmm4
        vpbroadcastd    12(%rsp), %ymm0
        vmovdqa %xmm4, 24(%rsp)
        vpaddd  8(%rsp), %ymm0, %ymm0
        leave
        .cfi_def_cfa 7, 8
        ret

clang 12 generates:

foo:                                    # @foo
        .cfi_startproc
# %bb.0:
        vmovdqa g(%rip), %ymm0
        vpcmpeqd        %ymm1, %ymm1, %ymm1
        vpxor   %ymm1, %ymm0, %ymm1
        vpermq  $68, %ymm1, %ymm1               # ymm1 = ymm1[0,1,0,1]
        vpshufb .LCPI0_0(%rip), %ymm1, %ymm1    # ymm1 = ymm1[0,1,2,0,5,1,0,1,3,2,3,0,4,3,1,2,18,16,20,18,19,17,17,18,19,20,17,17,16,16,21,18]
        vpaddb  %ymm0, %ymm1, %ymm0
        vpbroadcastd    .LCPI0_1(%rip), %ymm1   # ymm1 = [1,1,1,1,1,1,1,1]
        vpermd  %ymm0, %ymm1, %ymm1
        vpaddd  %ymm0, %ymm1, %ymm0
        retq
.Lfunc_end0:
        .size   foo, .Lfunc_end0-foo
        .cfi_endproc
