Issue 148238
Summary Bit-reversal codegen broken on x86 systems with `gfni` but no `avx`
Labels new issue
Assignees
Reporter TDecking
    <details>
<summary>LLVM</summary>

```llvm
; Baseline case: bit-reverses %s with the generic llvm.bitreverse.i64 intrinsic.
; Compiled with attribute group #0, which sets "target-cpu"="x86-64" and does
; not list any "target-features".
define noundef i64 @rev_u64_0(i64 noundef %s) unnamed_addr #0 {
start:
  %0 = tail call i64 @llvm.bitreverse.i64(i64 %s)
  ret i64 %0
}

; Case named in the issue summary: the same llvm.bitreverse.i64 call as
; @rev_u64_0, but compiled with attribute group #1, whose "target-features"
; are "+gfni,+sse,+sse2" (GFNI enabled, AVX not listed).
define noundef i64 @rev_u64_1(i64 noundef %s) unnamed_addr #1 {
start:
  %0 = tail call noundef i64 @llvm.bitreverse.i64(i64 %s)
  ret i64 %0
}

; Comparison case: the same llvm.bitreverse.i64 call, compiled with attribute
; group #2, which lists +avx and +avx2 in addition to +gfni.
define noundef i64 @rev_u64_2(i64 noundef %s) unnamed_addr #2 {
start:
  %0 = tail call noundef i64 @llvm.bitreverse.i64(i64 %s)
  ret i64 %0
}

; Hand-written variant that calls the GFNI affine intrinsic directly instead of
; llvm.bitreverse.i64. It uses the same attribute group (#1) as @rev_u64_1, so
; it is compiled with GFNI enabled and AVX not listed.
define noundef i64 @rev_u64_manual(i64 noundef %s) unnamed_addr #1 {
start:
 ; Put %s in lane 0 of a <2 x i64>; lane 1 is the constant 0.
 %.sroa.0.8.vec.insert.i = insertelement <2 x i64> <i64 poison, i64 0>, i64 %s, i64 0
  %0 = bitcast <2 x i64> %.sroa.0.8.vec.insert.i to <16 x i8>
 ; Apply the 128-bit affine intrinsic. The second operand is the byte pattern
 ; 1, 2, 4, ..., 64, -128 repeated for both 8-byte halves (i8 -128 is the bit
 ; pattern 0x80); the immediate operand is 0.
 %r = tail call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %0, <16 x i8> <i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 -128, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 -128>, i8 noundef zeroext 0) #5
  %1 = bitcast <16 x i8> %r to <2 x i64>
  ; Take the low 64-bit lane back out and byte-swap it to form the result.
  %.sroa.010.0.vec.extract = extractelement <2 x i64> %1, i64 0
  %2 = tail call noundef i64 @llvm.bswap.i64(i64 %.sroa.010.0.vec.extract)
  ret i64 %2
}

; Intrinsics used by the functions above. The two generic integer intrinsics
; share attribute group #3; the x86 GFNI intrinsic uses group #4.
declare i64 @llvm.bswap.i64(i64) #3
declare i64 @llvm.bitreverse.i64(i64) #3
declare <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8 immarg) unnamed_addr #4

; Groups #0, #1 and #2 are identical except for "target-features":
;   #0 - no "target-features" entry (used by @rev_u64_0)
;   #1 - "+gfni,+sse,+sse2", no AVX (used by @rev_u64_1 and @rev_u64_manual)
;   #2 - "+gfni" together with "+avx,+avx2" and further SSE-family features
;        (used by @rev_u64_2)
attributes #0 = { mustprogress nofree norecurse nosync nounwind nonlazybind willreturn memory(none) uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" }
attributes #1 = { mustprogress nofree norecurse nosync nounwind nonlazybind willreturn memory(none) uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" "target-features"="+gfni,+sse,+sse2" }
attributes #2 = { mustprogress nofree norecurse nosync nounwind nonlazybind willreturn memory(none) uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" "target-features"="+gfni,+sse,+sse2,+avx,+avx2,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+crc32,+ssse3" }
; #3 and #4 are the attribute groups of the intrinsic declarations; #5 is the
; call-site attribute group on the vgf2p8affineqb call in @rev_u64_manual.
attributes #3 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
attributes #5 = { nounwind }
```
</details>

<details>
<summary>Generated assembly</summary>


```asm
# Output for @rev_u64_0 (no extra target features): a purely scalar sequence.
# Reads its input from rdi and leaves the result in rax. It reverses the byte
# order with bswap, then swaps progressively smaller bit groups inside each
# byte using three 64-bit masks.
rev_u64_0:                              # @rev_u64_0
        bswap   rdi
        # Swap the two 4-bit halves of every byte.
        # 1085102592571150095 = 0x0F0F0F0F0F0F0F0F
        mov     rax, rdi
        shr rax, 4
        movabs  rcx, 1085102592571150095
        and     rax, rcx
 and     rdi, rcx
        shl     rdi, 4
        or      rdi, rax
        # Swap adjacent 2-bit groups.
        # 3689348814741910323 = 0x3333333333333333
 movabs  rax, 3689348814741910323
        mov     rcx, rdi
        and rcx, rax
        shr     rdi, 2
        and     rdi, rax
        # rax = ((x >> 2) & mask) + 4 * (x & mask); the two terms occupy
        # disjoint bit positions, so the add merges them.
        lea rax, [rdi + 4*rcx]
        # Swap adjacent single bits.
        # 6148914691236517205 = 0x5555555555555555
        movabs  rcx, 6148914691236517205
        mov rdx, rax
        and     rdx, rcx
        shr     rax
        and     rax, rcx
        # rax = ((x >> 1) & mask) + 2 * (x & mask), again with disjoint bits.
        lea     rax, [rax + 2*rdx]
        ret
# Constant pool for rev_u64_1: eight 16-byte entries, each referenced once by
# an instruction in rev_u64_1. .LCPI1_0, _2, _3, _5 and _6 are the memory
# operands of its five gf2p8affineqb instructions; .LCPI1_1, _4 and _7 are the
# byte masks loaded with movdqa. In every entry the second 8 bytes repeat the
# first 8.
# .LCPI1_0: operand of the first gf2p8affineqb.
.LCPI1_0:
 .byte   0                               # 0x0
        .byte   0 # 0x0
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte 128                             # 0x80
        .byte   64 # 0x40
        .byte   32                              # 0x20
 .byte   16                              # 0x10
        .byte   0 # 0x0
        .byte   0 # 0x0
        .byte   0                               # 0x0
        .byte 0                               # 0x0
        .byte   128 # 0x80
        .byte   64                              # 0x40
 .byte   32                              # 0x20
        .byte   16 # 0x10
# .LCPI1_1: 16 bytes, each with the value 15; first pand mask.
.LCPI1_1:
        .zero   16,15
# .LCPI1_2: operand of the second gf2p8affineqb.
.LCPI1_2:
 .byte   8                               # 0x8
        .byte   4 # 0x4
        .byte   2                               # 0x2
        .byte   1                               # 0x1
        .byte   0 # 0x0
        .byte   0 # 0x0
        .byte   0                               # 0x0
 .byte   0                               # 0x0
        .byte   8 # 0x8
        .byte   4                               # 0x4
        .byte   2                               # 0x2
        .byte   1 # 0x1
        .byte   0 # 0x0
        .byte   0                               # 0x0
 .byte   0                               # 0x0
        .byte   0 # 0x0
# .LCPI1_3: operand of the third gf2p8affineqb.
.LCPI1_3:
        .byte   0 # 0x0
        .byte   0                               # 0x0
 .byte   128                             # 0x80
        .byte   64 # 0x40
        .byte   32                              # 0x20
        .byte   16                              # 0x10
        .byte 8                               # 0x8
        .byte   4 # 0x4
        .byte   0                               # 0x0
 .byte   0                               # 0x0
        .byte   128 # 0x80
        .byte   64                              # 0x40
        .byte   32                              # 0x20
        .byte 16                              # 0x10
        .byte   8 # 0x8
        .byte   4                               # 0x4
# .LCPI1_4: 16 bytes, each with the value 51; second pand mask.
.LCPI1_4:
        .zero   16,51
# .LCPI1_5: operand of the fourth gf2p8affineqb.
.LCPI1_5:
        .byte   32 # 0x20
        .byte   16                              # 0x10
        .byte   8                               # 0x8
        .byte 4                               # 0x4
        .byte   2 # 0x2
        .byte   1                               # 0x1
 .byte   0                               # 0x0
        .byte   0 # 0x0
        .byte   32                              # 0x20
        .byte   16                              # 0x10
        .byte 8                               # 0x8
        .byte   4 # 0x4
        .byte   2                               # 0x2
 .byte   1                               # 0x1
        .byte   0 # 0x0
        .byte   0                               # 0x0
# .LCPI1_6: operand of the fifth gf2p8affineqb.
.LCPI1_6:
        .byte   0                               # 0x0
 .byte   128                             # 0x80
        .byte   64 # 0x40
        .byte   32                              # 0x20
        .byte   16                              # 0x10
        .byte 8                               # 0x8
        .byte   4 # 0x4
        .byte   2                               # 0x2
 .byte   0                               # 0x0
        .byte   128 # 0x80
        .byte   64                              # 0x40
        .byte   32                              # 0x20
        .byte 16                              # 0x10
        .byte   8 # 0x8
        .byte   4                               # 0x4
 .byte   2                               # 0x2
# .LCPI1_7: 16 bytes, each with the value 85; third pand mask.
.LCPI1_7:
        .zero 16,85
# Output for @rev_u64_1 (+gfni,+sse,+sse2, AVX not enabled): the case reported
# in this issue. It uses five gf2p8affineqb instructions, eight constant-pool
# entries and three mask-and-merge stages (byte masks 15, 51 and 85). By
# contrast, rev_u64_2 and rev_u64_manual each use a single affine instruction
# between the move into xmm0 and the final bswap.
# NOTE(review): the affine operations here presumably stand in for the
# per-byte shifts of the generic swap expansion seen in rev_u64_0 - confirm
# against the .LCPI1_* matrices.
rev_u64_1:                              # @rev_u64_1
        movq xmm0, rdi
        # Stage 1 (mask 15): xmm1 = affine(copy of input) & mask;
        # xmm0 = affine(input & mask); then merge the two with por.
        movdqa  xmm1, xmm0
        gf2p8affineqb   xmm1, xmmword ptr [rip + .LCPI1_0], 0
        movdqa  xmm2, xmmword ptr [rip + .LCPI1_1] # xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
        pand xmm1, xmm2
        pand    xmm0, xmm2
        gf2p8affineqb   xmm0, xmmword ptr [rip + .LCPI1_2], 0
        por     xmm0, xmm1
        # Stage 2 (mask 51): same shape as stage 1, using .LCPI1_3 and .LCPI1_5.
        movdqa  xmm1, xmm0
        gf2p8affineqb   xmm1, xmmword ptr [rip + .LCPI1_3], 0
 movdqa  xmm2, xmmword ptr [rip + .LCPI1_4] # xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
        pand    xmm1, xmm2
        pand    xmm0, xmm2
        gf2p8affineqb   xmm0, xmmword ptr [rip + .LCPI1_5], 0
        por     xmm0, xmm1
        # Stage 3 (mask 85): only one affine op; the other half is formed by
        # paddb xmm0, xmm0, which adds every byte to itself.
        movdqa  xmm1, xmm0
 gf2p8affineqb   xmm1, xmmword ptr [rip + .LCPI1_6], 0
        movdqa xmm2, xmmword ptr [rip + .LCPI1_7] # xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
        pand    xmm1, xmm2
        pand    xmm0, xmm2
        paddb   xmm0, xmm0
        por xmm0, xmm1
        # Move the low 64 bits back to a general-purpose register and reverse
        # the byte order.
        movq    rax, xmm0
        bswap   rax
 ret
# Constant for rev_u64_2: the bytes 1, 2, 4, ..., 64, 128 repeated twice. This
# is the same byte sequence as the constant vector passed to the affine
# intrinsic in the IR of @rev_u64_manual.
.LCPI2_0:
        .byte   1                               # 0x1
 .byte   2                               # 0x2
        .byte   4 # 0x4
        .byte   8                               # 0x8
        .byte   16                              # 0x10
        .byte 32                              # 0x20
        .byte   64 # 0x40
        .byte   128                             # 0x80
 .byte   1                               # 0x1
        .byte   2 # 0x2
        .byte   4                               # 0x4
        .byte   8                               # 0x8
        .byte 16                              # 0x10
        .byte   32 # 0x20
        .byte   64                              # 0x40
 .byte   128                             # 0x80
# Output for @rev_u64_2 (GFNI with AVX enabled): a single VEX-encoded affine
# instruction using .LCPI2_0, followed by a bswap of the low 64 bits.
rev_u64_2: # @rev_u64_2
        vmovq   xmm0, rdi
        vgf2p8affineqb xmm0, xmm0, xmmword ptr [rip + .LCPI2_0], 0
        vmovq   rax, xmm0
 bswap   rax
        ret
# Constant for rev_u64_manual: byte-for-byte the same values as .LCPI2_0
# (1, 2, 4, ..., 64, 128 repeated twice).
.LCPI3_0:
        .byte   1 # 0x1
        .byte   2                               # 0x2
 .byte   4                               # 0x4
        .byte   8 # 0x8
        .byte   16                              # 0x10
        .byte   32                              # 0x20
        .byte 64                              # 0x40
        .byte   128 # 0x80
        .byte   1                               # 0x1
 .byte   2                               # 0x2
        .byte   4 # 0x4
        .byte   8                               # 0x8
        .byte   16                              # 0x10
        .byte 32                              # 0x20
        .byte   64 # 0x40
        .byte   128                             # 0x80
# Output for @rev_u64_manual (+gfni,+sse,+sse2, AVX not enabled): the
# non-VEX counterpart of the rev_u64_2 sequence. It uses one gf2p8affineqb
# with .LCPI3_0 and then a bswap, which shows the short form can be encoded
# with the same feature set that rev_u64_1 is compiled for.
rev_u64_manual:                         # @rev_u64_manual
        movq xmm0, rdi
        gf2p8affineqb   xmm0, xmmword ptr [rip + .LCPI3_0], 0
 movq    rax, xmm0
        bswap   rax
        ret
```
</details>
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to