Issue: 81617
Summary: [X86] Inconsistent bool splat codegen
Labels: backend:X86, missed-optimization
Assignees: (none)
Reporter: RKSimon
https://simd.godbolt.org/z/W8Kvdv5eM
For cases where we need to broadcast a scalar bool value to all vector lanes:
```c
__m128i boolv(bool accumulate) {
  return accumulate ? _mm_set1_epi8(-1) : _mm_setzero_si128();
}
```
There are various approaches, all of which result in different codegen:
```ll
define <16 x i8> @boolv8(i1 zeroext %x) {
%r = select i1 %x, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> zeroinitializer
ret <16 x i8> %r
}
define <2 x i64> @boolv64(i1 zeroext %x) {
%r = select i1 %x, <2 x i64> <i64 -1, i64 -1>, <2 x i64> zeroinitializer
ret <2 x i64> %r
}
define <16 x i8> @vecv8(i1 zeroext %x) {
%v = insertelement <16 x i1> undef, i1 %x, i32 0
%s = shufflevector <16 x i1> %v, <16 x i1> undef, <16 x i32> zeroinitializer
%r = sext <16 x i1> %s to <16 x i8>
ret <16 x i8> %r
}
define <2 x i64> @vecv64(i1 zeroext %x) {
%v = insertelement <2 x i1> undef, i1 %x, i32 0
%s = shufflevector <2 x i1> %v, <2 x i1> undef, <2 x i32> zeroinitializer
%r = sext <2 x i1> %s to <2 x i64>
ret <2 x i64> %r
}
define <16 x i8> @bcstv8(i1 zeroext %x) {
%ext = sext i1 %x to i8
%v = insertelement <16 x i8> undef, i8 %ext, i32 0
%s = shufflevector <16 x i8> %v, <16 x i8> undef, <16 x i32> zeroinitializer
ret <16 x i8> %s
}
define <2 x i64> @bcstv64(i1 zeroext %x) {
%ext = sext i1 %x to i64
%v = insertelement <2 x i64> undef, i64 %ext, i32 0
%s = shufflevector <2 x i64> %v, <2 x i64> undef, <2 x i32> zeroinitializer
ret <2 x i64> %s
}
```
x86-64-v3:
```asm
boolv8: # @boolv8
vpcmpeqd %xmm0, %xmm0, %xmm0
testl %edi, %edi
jne .LBB0_2
vpxor %xmm0, %xmm0, %xmm0
.LBB0_2:
retq
boolv64: # @boolv64
vpcmpeqd %xmm0, %xmm0, %xmm0
testl %edi, %edi
jne .LBB1_2
vpxor %xmm0, %xmm0, %xmm0
.LBB1_2:
retq
vecv8: # @vecv8
vmovd %edi, %xmm0
vpbroadcastb %xmm0, %xmm0
vpsllw $7, %xmm0, %xmm0
vpxor %xmm1, %xmm1, %xmm1
vpcmpgtb %xmm0, %xmm1, %xmm0
retq
vecv64: # @vecv64
vmovd %edi, %xmm0
vpbroadcastd %xmm0, %xmm0
vpsllq $63, %xmm0, %xmm0
vpxor %xmm1, %xmm1, %xmm1
vpcmpgtq %xmm0, %xmm1, %xmm0
retq
bcstv8: # @bcstv8
negb %dil
vmovd %edi, %xmm0
vpbroadcastb %xmm0, %xmm0
retq
bcstv64: # @bcstv64
movl %edi, %eax
negq %rax
vmovq %rax, %xmm0
vpbroadcastq %xmm0, %xmm0
retq
```
The (branchless) broadcast approach is almost certainly the best: it always wins on AVX2+ targets, and on all SSE targets as well provided we broadcast using i32/i64 element types.
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs