Issue |
148238
|
Summary |
Bit-reversal codegen broken on x86 systems with `gfni` but no `avx`
|
Labels |
new issue
|
Assignees |
|
Reporter |
TDecking
|
<details>
<summary>LLVM</summary>
```llvm
; Baseline case: bit-reverse the 64-bit argument via the generic
; llvm.bitreverse.i64 intrinsic. Attribute group #0 sets only
; "target-cpu"="x86-64" and lists no extra target features.
define noundef i64 @rev_u64_0(i64 noundef %s) unnamed_addr #0 {
start:
%0 = tail call i64 @llvm.bitreverse.i64(i64 %s)
ret i64 %0
}
; Same intrinsic call as rev_u64_0, but compiled with attribute group #1,
; whose "target-features" are "+gfni,+sse,+sse2" (GFNI enabled, no AVX).
; This is the configuration named in the report's summary.
define noundef i64 @rev_u64_1(i64 noundef %s) unnamed_addr #1 {
start:
%0 = tail call noundef i64 @llvm.bitreverse.i64(i64 %s)
ret i64 %0
}
; Same intrinsic call again, compiled with attribute group #2, which enables
; GFNI together with AVX/AVX2 (and further SSE-level features).
; Serves as the GFNI-with-AVX comparison point for rev_u64_1.
define noundef i64 @rev_u64_2(i64 noundef %s) unnamed_addr #2 {
start:
%0 = tail call noundef i64 @llvm.bitreverse.i64(i64 %s)
ret i64 %0
}
; Hand-written variant compiled with attribute group #1 (GFNI, no AVX):
; instead of llvm.bitreverse.i64 it calls the x86 GF(2) affine intrinsic
; directly and then byte-swaps the result.
define noundef i64 @rev_u64_manual(i64 noundef %s) unnamed_addr #1 {
start:
; Place %s in lane 0 of a <2 x i64>; lane 1 is the constant 0.
%.sroa.0.8.vec.insert.i = insertelement <2 x i64> <i64 poison, i64 0>, i64 %s, i64 0
; Reinterpret the 128-bit vector as sixteen bytes for the affine intrinsic.
%0 = bitcast <2 x i64> %.sroa.0.8.vec.insert.i to <16 x i8>
; Affine transform with matrix bytes 1,2,4,...,-128 (i.e. 0x01..0x80) repeated
; for both 64-bit halves and an immediate of 0.
%r = tail call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %0, <16 x i8> <i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 -128, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 -128>, i8 noundef zeroext 0) #5
; Back to <2 x i64> and pull out lane 0, the lane that held %s.
%1 = bitcast <16 x i8> %r to <2 x i64>
%.sroa.010.0.vec.extract = extractelement <2 x i64> %1, i64 0
; Finish by reversing the byte order of the 64-bit value.
%2 = tail call noundef i64 @llvm.bswap.i64(i64 %.sroa.010.0.vec.extract)
ret i64 %2
}
; Intrinsics used by the functions above.
declare i64 @llvm.bswap.i64(i64) #3
declare i64 @llvm.bitreverse.i64(i64) #3
declare <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8 immarg) unnamed_addr #4
; Groups #0-#2 are applied to the function definitions and differ only in
; "target-features".
; #0: target-cpu x86-64 with no "target-features" entry (used by rev_u64_0).
attributes #0 = { mustprogress nofree norecurse nosync nounwind nonlazybind willreturn memory(none) uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" }
; #1: adds +gfni,+sse,+sse2 and no AVX (used by rev_u64_1 and rev_u64_manual).
attributes #1 = { mustprogress nofree norecurse nosync nounwind nonlazybind willreturn memory(none) uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" "target-features"="+gfni,+sse,+sse2" }
; #2: adds +gfni together with +avx,+avx2 and further SSE levels (used by rev_u64_2).
attributes #2 = { mustprogress nofree norecurse nosync nounwind nonlazybind willreturn memory(none) uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" "target-features"="+gfni,+sse,+sse2,+avx,+avx2,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+crc32,+ssse3" }
; #3 / #4: attributes of the intrinsic declarations; #5: call-site attribute
; on the vgf2p8affineqb call in rev_u64_manual.
attributes #3 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
attributes #5 = { nounwind }
```
</details>
<details>
<summary>Generated assembly</summary>
```asm
# Output for rev_u64_0: a purely scalar expansion.
# In:  rdi = value to bit-reverse.  Out: rax = result.
# Strategy: reverse the byte order, then swap ever smaller bit groups
# (nibbles, 2-bit pairs, single bits) inside every byte.
rev_u64_0: # @rev_u64_0
bswap rdi
# Swap the two nibbles of each byte.
mov rax, rdi
shr rax, 4
movabs rcx, 1085102592571150095 # 0x0F0F0F0F0F0F0F0F
and rax, rcx
and rdi, rcx
shl rdi, 4
or rdi, rax
# Swap adjacent 2-bit pairs.
movabs rax, 3689348814741910323 # 0x3333333333333333
mov rcx, rdi
and rcx, rax
shr rdi, 2
and rdi, rax
lea rax, [rdi + 4*rcx] # ((x >> 2) & mask) + ((x & mask) << 2)
# Swap adjacent single bits.
movabs rcx, 6148914691236517205 # 0x5555555555555555
mov rdx, rax
and rdx, rcx
shr rax
and rax, rcx
lea rax, [rax + 2*rdx] # ((x >> 1) & mask) + ((x & mask) << 1)
ret
# Constant pool for rev_u64_1. Each 16-byte constant is either a
# gf2p8affineqb matrix operand (the same 8-byte matrix repeated for both
# 64-bit halves) or a byte mask used by pand.
# Matrix readings below follow the GF2P8AFFINEQB definition (result bit i is
# the parity of matrix byte [7-i] ANDed with the source byte); they are
# derived from that definition, not stated by the compiler output itself.

# Matrix for the 1st gf2p8affineqb: per-byte logical shift right by 4.
.LCPI1_0:
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 128 # 0x80
.byte 64 # 0x40
.byte 32 # 0x20
.byte 16 # 0x10
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 128 # 0x80
.byte 64 # 0x40
.byte 32 # 0x20
.byte 16 # 0x10
# Mask: 16 bytes of 15 (0x0F), selects the low nibble of every byte.
.LCPI1_1:
.zero 16,15
# Matrix for the 2nd gf2p8affineqb: per-byte shift left by 4.
.LCPI1_2:
.byte 8 # 0x8
.byte 4 # 0x4
.byte 2 # 0x2
.byte 1 # 0x1
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 8 # 0x8
.byte 4 # 0x4
.byte 2 # 0x2
.byte 1 # 0x1
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
# Matrix for the 3rd gf2p8affineqb: per-byte logical shift right by 2.
.LCPI1_3:
.byte 0 # 0x0
.byte 0 # 0x0
.byte 128 # 0x80
.byte 64 # 0x40
.byte 32 # 0x20
.byte 16 # 0x10
.byte 8 # 0x8
.byte 4 # 0x4
.byte 0 # 0x0
.byte 0 # 0x0
.byte 128 # 0x80
.byte 64 # 0x40
.byte 32 # 0x20
.byte 16 # 0x10
.byte 8 # 0x8
.byte 4 # 0x4
# Mask: 16 bytes of 51 (0x33), selects the low 2 bits of every 4-bit group.
.LCPI1_4:
.zero 16,51
# Matrix for the 4th gf2p8affineqb: per-byte shift left by 2.
.LCPI1_5:
.byte 32 # 0x20
.byte 16 # 0x10
.byte 8 # 0x8
.byte 4 # 0x4
.byte 2 # 0x2
.byte 1 # 0x1
.byte 0 # 0x0
.byte 0 # 0x0
.byte 32 # 0x20
.byte 16 # 0x10
.byte 8 # 0x8
.byte 4 # 0x4
.byte 2 # 0x2
.byte 1 # 0x1
.byte 0 # 0x0
.byte 0 # 0x0
# Matrix for the 5th gf2p8affineqb: per-byte logical shift right by 1.
.LCPI1_6:
.byte 0 # 0x0
.byte 128 # 0x80
.byte 64 # 0x40
.byte 32 # 0x20
.byte 16 # 0x10
.byte 8 # 0x8
.byte 4 # 0x4
.byte 2 # 0x2
.byte 0 # 0x0
.byte 128 # 0x80
.byte 64 # 0x40
.byte 32 # 0x20
.byte 16 # 0x10
.byte 8 # 0x8
.byte 4 # 0x4
.byte 2 # 0x2
# Mask: 16 bytes of 85 (0x55), selects every other bit.
.LCPI1_7:
.zero 16,85
# Output for rev_u64_1 (GFNI enabled, no AVX) - the case the report is about.
# In:  rdi = value to bit-reverse.  Out: rax = result.
# The code mirrors the scalar swap-nibbles / swap-pairs / swap-bits expansion
# of rev_u64_0, but performs it on bytes in an XMM register. Five separate
# gf2p8affineqb instructions are issued, each with its own matrix constant
# (.LCPI1_0/2/3/5/6); going by the GF2P8AFFINEQB definition those matrices act
# as per-byte shifts. Compare rev_u64_2 and rev_u64_manual, which need only a
# single affine instruction.
rev_u64_1: # @rev_u64_1
movq xmm0, rdi
# Stage 1: swap the nibbles of each byte (masks of 0x0F).
movdqa xmm1, xmm0
gf2p8affineqb xmm1, xmmword ptr [rip + .LCPI1_0], 0
movdqa xmm2, xmmword ptr [rip + .LCPI1_1] # xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
pand xmm1, xmm2
pand xmm0, xmm2
gf2p8affineqb xmm0, xmmword ptr [rip + .LCPI1_2], 0
por xmm0, xmm1
# Stage 2: swap adjacent 2-bit pairs (masks of 0x33).
movdqa xmm1, xmm0
gf2p8affineqb xmm1, xmmword ptr [rip + .LCPI1_3], 0
movdqa xmm2, xmmword ptr [rip + .LCPI1_4] # xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
pand xmm1, xmm2
pand xmm0, xmm2
gf2p8affineqb xmm0, xmmword ptr [rip + .LCPI1_5], 0
por xmm0, xmm1
# Stage 3: swap adjacent single bits (masks of 0x55).
movdqa xmm1, xmm0
gf2p8affineqb xmm1, xmmword ptr [rip + .LCPI1_6], 0
movdqa xmm2, xmmword ptr [rip + .LCPI1_7] # xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
pand xmm1, xmm2
pand xmm0, xmm2
paddb xmm0, xmm0 # x + x doubles each byte, i.e. shifts it left by one bit
por xmm0, xmm1
# Bits are now reversed within each byte; bswap reverses the byte order.
movq rax, xmm0
bswap rax
ret
# Matrix operand for rev_u64_2: bytes 0x01,0x02,...,0x80, repeated for both
# 64-bit halves. Per the GF2P8AFFINEQB definition (result bit i = parity of
# matrix byte [7-i] AND source byte) this reverses the bit order within each
# byte.
.LCPI2_0:
.byte 1 # 0x1
.byte 2 # 0x2
.byte 4 # 0x4
.byte 8 # 0x8
.byte 16 # 0x10
.byte 32 # 0x20
.byte 64 # 0x40
.byte 128 # 0x80
.byte 1 # 0x1
.byte 2 # 0x2
.byte 4 # 0x4
.byte 8 # 0x8
.byte 16 # 0x10
.byte 32 # 0x20
.byte 64 # 0x40
.byte 128 # 0x80
# Output for rev_u64_2 (GFNI and AVX enabled).
# In:  rdi = value to bit-reverse.  Out: rax = result.
# A single VEX-encoded affine instruction followed by a byte swap.
rev_u64_2: # @rev_u64_2
vmovq xmm0, rdi
vgf2p8affineqb xmm0, xmm0, xmmword ptr [rip + .LCPI2_0], 0
vmovq rax, xmm0
bswap rax # reverse byte order to complete the 64-bit reversal
ret
# Matrix operand for rev_u64_manual: the same byte values as .LCPI2_0
# (0x01,0x02,...,0x80, repeated for both 64-bit halves).
.LCPI3_0:
.byte 1 # 0x1
.byte 2 # 0x2
.byte 4 # 0x4
.byte 8 # 0x8
.byte 16 # 0x10
.byte 32 # 0x20
.byte 64 # 0x40
.byte 128 # 0x80
.byte 1 # 0x1
.byte 2 # 0x2
.byte 4 # 0x4
.byte 8 # 0x8
.byte 16 # 0x10
.byte 32 # 0x20
.byte 64 # 0x40
.byte 128 # 0x80
# Output for rev_u64_manual (GFNI enabled, no AVX; explicit intrinsic in the IR).
# In:  rdi = value to bit-reverse.  Out: rax = result.
# Same shape as rev_u64_2 but with the non-VEX encodings (movq /
# gf2p8affineqb), showing that the single-instruction form is available under
# the same feature set that rev_u64_1 was compiled with.
rev_u64_manual: # @rev_u64_manual
movq xmm0, rdi
gf2p8affineqb xmm0, xmmword ptr [rip + .LCPI3_0], 0
movq rax, xmm0
bswap rax # reverse byte order to complete the 64-bit reversal
ret
```
</details>
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs