Issue 81617
Summary [X86] Inconsistent bool splat codegen
Labels backend:X86, missed-optimization
Assignees
Reporter RKSimon
    https://simd.godbolt.org/z/W8Kvdv5eM

For cases where we need to broadcast a scalar bool value to all vector lanes:
```c
__m128i boolv(bool accumulate) {
  return accumulate ? _mm_set1_epi8(-1) : _mm_setzero_si128();
}
```
There are various approaches, all of which result in different codegen:
```ll
define <16 x i8> @boolv8(i1 zeroext %x) {
  %r = select i1 %x, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> zeroinitializer
  ret <16 x i8> %r
}
define <2 x i64> @boolv64(i1 zeroext %x) {
  %r = select i1 %x, <2 x i64> <i64 -1, i64 -1>, <2 x i64> zeroinitializer
  ret <2 x i64> %r
}
define <16 x i8> @vecv8(i1 zeroext %x) {
  %v = insertelement <16 x i1> undef, i1 %x, i32 0
  %s = shufflevector <16 x i1> %v, <16 x i1> undef, <16 x i32> zeroinitializer
  %r = sext <16 x i1> %s to <16 x i8>
  ret <16 x i8> %r
}
define <2 x i64> @vecv64(i1 zeroext %x) {
  %v = insertelement <2 x i1> undef, i1 %x, i32 0
  %s = shufflevector <2 x i1> %v, <2 x i1> undef, <2 x i32> zeroinitializer
  %r = sext <2 x i1> %s to <2 x i64>
  ret <2 x i64> %r
}
define <16 x i8> @bcstv8(i1 zeroext %x) {
  %ext = sext i1 %x to i8
  %v = insertelement <16 x i8> undef, i8 %ext, i32 0
  %s = shufflevector <16 x i8> %v, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %s
}
define <2 x i64> @bcstv64(i1 zeroext %x) {
  %ext = sext i1 %x to i64
  %v = insertelement <2 x i64> undef, i64 %ext, i32 0
  %s = shufflevector <2 x i64> %v, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %s
}
```
x86-64-v3:
```asm
boolv8: # @boolv8
        vpcmpeqd        %xmm0, %xmm0, %xmm0
        testl   %edi, %edi
        jne     .LBB0_2
        vpxor   %xmm0, %xmm0, %xmm0
.LBB0_2:
        retq
boolv64: # @boolv64
        vpcmpeqd        %xmm0, %xmm0, %xmm0
        testl   %edi, %edi
        jne     .LBB1_2
        vpxor   %xmm0, %xmm0, %xmm0
.LBB1_2:
        retq
vecv8: # @vecv8
        vmovd   %edi, %xmm0
        vpbroadcastb    %xmm0, %xmm0
        vpsllw  $7, %xmm0, %xmm0
        vpxor   %xmm1, %xmm1, %xmm1
        vpcmpgtb        %xmm0, %xmm1, %xmm0
        retq
vecv64:                                 # @vecv64
        vmovd   %edi, %xmm0
        vpbroadcastd    %xmm0, %xmm0
        vpsllq  $63, %xmm0, %xmm0
        vpxor   %xmm1, %xmm1, %xmm1
        vpcmpgtq        %xmm0, %xmm1, %xmm0
        retq
bcstv8:                                 # @bcstv8
        negb    %dil
        vmovd   %edi, %xmm0
        vpbroadcastb    %xmm0, %xmm0
        retq
bcstv64:                                # @bcstv64
        movl    %edi, %eax
        negq    %rax
        vmovq   %rax, %xmm0
        vpbroadcastq    %xmm0, %xmm0
        retq
```
The (branchless) broadcast approach is almost certainly the best one: it always wins on AVX2+ targets, and on all SSE targets as well, provided we correctly broadcast using i32/i64 element types there.
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to