| Issue |
170665
|
| Summary |
[VectorCombine] Failure to merge duplicate new nodes
|
| Labels |
missed-optimization,
llvm::vectorcombine
|
| Assignees |
|
| Reporter |
RKSimon
|
VectorCombine fails to completely concatenate these 2 x v4f32 FMA chains:
```ll
define <8 x float> @fmachain(<4 x float> %a0, <4 x float> %a1) {
%d0 = fadd <4 x float> %a0, splat (float -1.000000e+00)
%l0 = call <4 x float> @llvm.fma.v4f32(<4 x float> %d0, <4 x float> splat (float 0x3F8DE8D040000000), <4 x float> splat (float 0xBFB3715EE0000000))
%l1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %l0, <4 x float> %d0, <4 x float> splat (float 0x3FC82778A0000000))
%l2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %l1, <4 x float> %d0, <4 x float> splat (float 0xBFD493F7E0000000))
%l3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %l2, <4 x float> %d0, <4 x float> splat (float 0x3FDE311220000000))
%l4 = call <4 x float> @llvm.fma.v4f32(<4 x float> %l3, <4 x float> %d0, <4 x float> splat (float 0xBFE70BF2A0000000))
%l5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %l4, <4 x float> %d0, <4 x float> splat (float 0x3FF71507C0000000))
%d1 = fadd <4 x float> %a1, splat (float -1.000000e+00)
%h0 = call <4 x float> @llvm.fma.v4f32(<4 x float> %d1, <4 x float> splat (float 0x3F8DE8D040000000), <4 x float> splat (float 0xBFB3715EE0000000))
%h1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %h0, <4 x float> %d1, <4 x float> splat (float 0x3FC82778A0000000))
%h2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %h1, <4 x float> %d1, <4 x float> splat (float 0xBFD493F7E0000000))
%h3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %h2, <4 x float> %d1, <4 x float> splat (float 0x3FDE311220000000))
%h4 = call <4 x float> @llvm.fma.v4f32(<4 x float> %h3, <4 x float> %d1, <4 x float> splat (float 0xBFE70BF2A0000000))
%h5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %h4, <4 x float> %d1, <4 x float> splat (float 0x3FF71507C0000000))
%res = shufflevector <4 x float> %l5, <4 x float> %h5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %res
}
```
-passes=vector-combine -mtriple=x86_64-- -mcpu=x86-64-v3
```ll
define <8 x float> @fmachain(<4 x float> %a0, <4 x float> %a1) #0 {
%d0 = fadd <4 x float> %a0, splat (float -1.000000e+00)
%d1 = fadd <4 x float> %a1, splat (float -1.000000e+00)
%1 = shufflevector <4 x float> %d0, <4 x float> %d1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = call <8 x float> @llvm.fma.v8f32(<8 x float> %1, <8 x float> splat (float 0x3F8DE8D040000000), <8 x float> splat (float 0xBFB3715EE0000000))
%3 = shufflevector <4 x float> %d0, <4 x float> %d1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%4 = call <8 x float> @llvm.fma.v8f32(<8 x float> %2, <8 x float> %3, <8 x float> splat (float 0x3FC82778A0000000))
%5 = shufflevector <4 x float> %d0, <4 x float> %d1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%6 = call <8 x float> @llvm.fma.v8f32(<8 x float> %4, <8 x float> %5, <8 x float> splat (float 0xBFD493F7E0000000))
%7 = shufflevector <4 x float> %d0, <4 x float> %d1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%8 = call <8 x float> @llvm.fma.v8f32(<8 x float> %6, <8 x float> %7, <8 x float> splat (float 0x3FDE311220000000))
%9 = shufflevector <4 x float> %d0, <4 x float> %d1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%10 = call <8 x float> @llvm.fma.v8f32(<8 x float> %8, <8 x float> %9, <8 x float> splat (float 0xBFE70BF2A0000000))
%11 = shufflevector <4 x float> %d0, <4 x float> %d1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%res = call <8 x float> @llvm.fma.v8f32(<8 x float> %10, <8 x float> %11, <8 x float> splat (float 0x3FF71507C0000000))
ret <8 x float> %res
}
```
Each time it merges an FMA intrinsic it creates yet another `shufflevector <4 x float> %d0, <4 x float> %d1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>` instruction instead of reusing any of the identical nodes it has previously created. The resulting multiple uses of the %d0 and %d1 values prevent the two fadd instructions from being merged as well. The expected output is:
```ll
define <8 x float> @fmachain(<4 x float> %a0, <4 x float> %a1) #0 {
%1 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = fadd <8 x float> %1, splat (float -1.000000e+00)
%3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %2, <8 x float> splat (float 0x3F8DE8D040000000), <8 x float> splat (float 0xBFB3715EE0000000))
%4 = call <8 x float> @llvm.fma.v8f32(<8 x float> %3, <8 x float> %2, <8 x float> splat (float 0x3FC82778A0000000))
%5 = call <8 x float> @llvm.fma.v8f32(<8 x float> %4, <8 x float> %2, <8 x float> splat (float 0xBFD493F7E0000000))
%6 = call <8 x float> @llvm.fma.v8f32(<8 x float> %5, <8 x float> %2, <8 x float> splat (float 0x3FDE311220000000))
%7 = call <8 x float> @llvm.fma.v8f32(<8 x float> %6, <8 x float> %2, <8 x float> splat (float 0xBFE70BF2A0000000))
%res = call <8 x float> @llvm.fma.v8f32(<8 x float> %7, <8 x float> %2, <8 x float> splat (float 0x3FF71507C0000000))
ret <8 x float> %res
}
```
We don't need to end up with a full implementation of EarlyCSE, but it would be good to have a basic value-numbering system paired with the InstructionWorklist to at least avoid repeatedly creating the same instruction over and over again.
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs