Issue 170665
Summary [VectorCombine] Failure to merge duplicate new nodes
Labels missed-optimization, llvm::vectorcombine
Assignees
Reporter RKSimon
    VectorCombine fails to completely concatenate these two <4 x float> FMA chains:
```ll
; Reproducer: two independent <4 x float> chains with identical structure and
; identical splat constants, one fed by %a0 and one by %a1. Each chain is an
; fadd with splat(-1.0) followed by six dependent llvm.fma.v4f32 calls; the two
; chain results are then concatenated into one <8 x float> value.
define <8 x float> @fmachain(<4 x float> %a0, <4 x float> %a1) {
  ; Chain fed by %a0. %d0 is the first operand of the first fma and the second
  ; operand of every later fma, so it has multiple uses.
  %d0 = fadd <4 x float> %a0, splat (float -1.000000e+00)
  %l0 = call <4 x float> @llvm.fma.v4f32(<4 x float> %d0, <4 x float> splat (float 0x3F8DE8D040000000), <4 x float> splat (float 0xBFB3715EE0000000))
  %l1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %l0, <4 x float> %d0, <4 x float> splat (float 0x3FC82778A0000000))
  %l2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %l1, <4 x float> %d0, <4 x float> splat (float 0xBFD493F7E0000000))
  %l3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %l2, <4 x float> %d0, <4 x float> splat (float 0x3FDE311220000000))
  %l4 = call <4 x float> @llvm.fma.v4f32(<4 x float> %l3, <4 x float> %d0, <4 x float> splat (float 0xBFE70BF2A0000000))
  %l5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %l4, <4 x float> %d0, <4 x float> splat (float 0x3FF71507C0000000))

  ; Chain fed by %a1: same operations and constants as above, with %d1 in the
  ; role of %d0.
  %d1 = fadd <4 x float> %a1, splat (float -1.000000e+00)
  %h0 = call <4 x float> @llvm.fma.v4f32(<4 x float> %d1, <4 x float> splat (float 0x3F8DE8D040000000), <4 x float> splat (float 0xBFB3715EE0000000))
  %h1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %h0, <4 x float> %d1, <4 x float> splat (float 0x3FC82778A0000000))
  %h2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %h1, <4 x float> %d1, <4 x float> splat (float 0xBFD493F7E0000000))
  %h3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %h2, <4 x float> %d1, <4 x float> splat (float 0x3FDE311220000000))
  %h4 = call <4 x float> @llvm.fma.v4f32(<4 x float> %h3, <4 x float> %d1, <4 x float> splat (float 0xBFE70BF2A0000000))
  %h5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %h4, <4 x float> %d1, <4 x float> splat (float 0x3FF71507C0000000))

  ; Identity-mask concatenation: elements 0-3 come from %l5, 4-7 from %h5.
  %res = shufflevector <4 x float> %l5, <4 x float> %h5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

  ret <8 x float> %res
}
```
-passes=vector-combine -mtriple=x86_64-- -mcpu=x86-64-v3
```ll
; Actual output: the six fma calls have been widened to llvm.fma.v8f32, but the
; two fadds are still performed separately at <4 x float> width, and the
; concatenation of %d0 and %d1 is materialized six times (%1, %3, %5, %7, %9,
; %11) as byte-for-byte identical shufflevector instructions.
define <8 x float> @fmachain(<4 x float> %a0, <4 x float> %a1) #0 {
  %d0 = fadd <4 x float> %a0, splat (float -1.000000e+00)
 %d1 = fadd <4 x float> %a1, splat (float -1.000000e+00)
  ; First concatenation of %d0/%d1; every later odd-numbered value repeats it.
  %1 = shufflevector <4 x float> %d0, <4 x float> %d1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = call <8 x float> @llvm.fma.v8f32(<8 x float> %1, <8 x float> splat (float 0x3F8DE8D040000000), <8 x float> splat (float 0xBFB3715EE0000000))
  ; Duplicate of %1.
  %3 = shufflevector <4 x float> %d0, <4 x float> %d1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %4 = call <8 x float> @llvm.fma.v8f32(<8 x float> %2, <8 x float> %3, <8 x float> splat (float 0x3FC82778A0000000))
  ; Duplicate of %1.
  %5 = shufflevector <4 x float> %d0, <4 x float> %d1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %6 = call <8 x float> @llvm.fma.v8f32(<8 x float> %4, <8 x float> %5, <8 x float> splat (float 0xBFD493F7E0000000))
  ; Duplicate of %1.
  %7 = shufflevector <4 x float> %d0, <4 x float> %d1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %8 = call <8 x float> @llvm.fma.v8f32(<8 x float> %6, <8 x float> %7, <8 x float> splat (float 0x3FDE311220000000))
  ; Duplicate of %1.
  %9 = shufflevector <4 x float> %d0, <4 x float> %d1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %10 = call <8 x float> @llvm.fma.v8f32(<8 x float> %8, <8 x float> %9, <8 x float> splat (float 0xBFE70BF2A0000000))
  ; Duplicate of %1.
  %11 = shufflevector <4 x float> %d0, <4 x float> %d1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %res = call <8 x float> @llvm.fma.v8f32(<8 x float> %10, <8 x float> %11, <8 x float> splat (float 0x3FF71507C0000000))
  ret <8 x float> %res
}
```
Each time it merges an FMA intrinsic it creates yet another `shufflevector <4 x float> %d0, <4 x float> %d1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>` instruction instead of reusing any existing nodes it has previously created. The multiple uses of the %d0 and %d1 values then prevent the two fadds from being merged as well. The expected output is:
```ll
; Expected output: %a0 and %a1 are concatenated exactly once, and the fadd and
; all six fmas run at <8 x float> width. %2 (the widened fadd result) plays the
; role that %d0/%d1 play in the original chains: it is the first operand of the
; first fma and the second operand of every later fma.
define <8 x float> @fmachain(<4 x float> %a0, <4 x float> %a1) #0 {
  ; Single identity-mask concatenation: elements 0-3 from %a0, 4-7 from %a1.
  %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = fadd <8 x float> %1, splat (float -1.000000e+00)
  ; Each fma takes the previous fma result as its first operand and reuses %2
  ; as its second operand.
  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %2, <8 x float> splat (float 0x3F8DE8D040000000), <8 x float> splat (float 0xBFB3715EE0000000))
  %4 = call <8 x float> @llvm.fma.v8f32(<8 x float> %3, <8 x float> %2, <8 x float> splat (float 0x3FC82778A0000000))
  %5 = call <8 x float> @llvm.fma.v8f32(<8 x float> %4, <8 x float> %2, <8 x float> splat (float 0xBFD493F7E0000000))
  %6 = call <8 x float> @llvm.fma.v8f32(<8 x float> %5, <8 x float> %2, <8 x float> splat (float 0x3FDE311220000000))
  %7 = call <8 x float> @llvm.fma.v8f32(<8 x float> %6, <8 x float> %2, <8 x float> splat (float 0xBFE70BF2A0000000))
  %res = call <8 x float> @llvm.fma.v8f32(<8 x float> %7, <8 x float> %2, <8 x float> splat (float 0x3FF71507C0000000))
  ret <8 x float> %res
}
```
We don't need to end up with a full implementation of EarlyCSE, but it would be good to have a basic numbering system paired with the InstructionWorkList to at least avoid repeatedly creating the same instruction over and over again.
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to