Issue 122706
Summary Global ISel packing earlier with many masks
Labels llvm:globalisel, mlir:amdgpu
Assignees
Reporter tpopp
    I've reduced the function a lot to hopefully make the information more useful, but in the full function the problem described below is even more noticeably excessive. The output was produced with commands like `llc -O3 -march=amdgcn -mcpu=gfx942 -mtriple amdgcn-amd-amdhsa -global-isel={true,false}`.

Both cases pack inputs and use `v_pk_fma_f16` instructions, but GlobalISel does the packing early and then masks the packed values to extract the high/low words for various other instructions, resulting in a lot of extra masking computations, while SelectionDAG ISel inserts the packing just before the fma instructions. I haven't yet checked whether there is some heuristic that could be tweaked to trade off the cost of the extra masking.

```
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
target triple = "amdgcn-amd-amdhsa"

; Reduced reproducer kernel. From a single <4 x half> argument it derives one
; compare mask, uses that mask in several selects between zero and splat
; constants, chains three v4f16 fma calls, then adds, rounds to even, clamps
; from below and converts the result to <4 x i8> before storing it.
define amdgpu_kernel void @"main$async_dispatch_157_elementwise_2x1024x5120_f16xf16xf16xf32xi8"(<4 x half> %i37) {
bb:
  ; Per-lane mask: true where the input lane is ordered and less than zero.
  %i53 = fcmp olt <4 x half> %i37, zeroinitializer
  ; Every select below reuses %i53 to choose, per lane, between a zero vector
  ; and a splat constant. Note that %i55 has the zero vector on the false side,
  ; the opposite of the other four selects.
 %i54 = select <4 x i1> %i53, <4 x half> zeroinitializer, <4 x half> splat (half 0xH9AC3)
  %i55 = select <4 x i1> %i53, <4 x half> splat (half 0xH3C00), <4 x half> zeroinitializer
  %i57 = select <4 x i1> %i53, <4 x half> zeroinitializer, <4 x half> splat (half 0xH95CA)
  %i59 = select <4 x i1> %i53, <4 x half> zeroinitializer, <4 x half> splat (half 0xH7E00)
  %i63 = select <4 x i1> %i53, <4 x half> zeroinitializer, <4 x half> splat (half 0xH3C00)
  ; Three chained fma calls: each takes zeroinitializer as its first operand,
  ; the previous result (or %i59 for the first call) as its second operand, and
  ; one of the selected vectors (%i57, %i55, %i54) as the addend.
  %i66 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> zeroinitializer, <4 x half> %i59, <4 x half> %i57)
  %i67 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> zeroinitializer, <4 x half> %i66, <4 x half> %i55)
  %i68 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> zeroinitializer, <4 x half> %i67, <4 x half> %i54)
  %i74 = fadd <4 x half> %i63, %i68
  %i87 = tail call <4 x half> @llvm.roundeven.v4f16(<4 x half> %i74)
  ; Lower clamp: keep a lane of %i87 when it is ordered and >= 0xHD800
  ; (-128.0 as a half), otherwise substitute 0xHD800.
  %.inv = fcmp oge <4 x half> %i87, splat (half 0xHD800)
  %i88 = select <4 x i1> %.inv, <4 x half> %i87, <4 x half> splat (half 0xHD800)
  ; Convert the clamped halves to signed 8-bit integers.
 %i90 = fptosi <4 x half> %i88 to <4 x i8>
  ; The result is stored through a null addrspace(1) pointer.
  ; NOTE(review): presumably an artifact of test-case reduction — confirm
  ; against the unreduced function.
  store <4 x i8> %i90, ptr addrspace(1) null, align 1
  ret void
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare <4 x half> @llvm.roundeven.v4f16(<4 x half>) #0

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>) #0

; uselistorder directives
uselistorder ptr @llvm.fma.v4f16, { 2, 1, 0 }

attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
```

[reduced.gisel.txt](https://github.com/user-attachments/files/18397297/reduced.gisel.txt)
[reduced.sdisel.txt](https://github.com/user-attachments/files/18397295/reduced.sdisel.txt)
[reduced.txt](https://github.com/user-attachments/files/18397296/reduced.txt)
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to