Issue 123631
Summary [AMDGPU][GISel] BFI generated instead of a smaller load
Labels backend:AMDGPU, llvm:globalisel
Assignees
Reporter qcolombet
    When using GISel, I end up with `bfi` instructions where SDISel uses a simpler sequence of instructions.
I've attached a small reproducer and a slightly bigger one because I don't know whether the fix will be exactly the same for both cases: the small reproducer has all the relevant instructions in the same basic block, whereas the bigger one does not.

In both cases, the problem stems from the fact that SDISel is able to simplify `extract_subvector (load <8 x half> %addr), high_4_half` into `load <4 x half> %addr + 8`, whereas GISel lowers this sequence all the way to ISel without any simplification.

This combine may be worth putting in the generic combiner helper.

# To Reproduce #

Download the attached reproducer or copy/paste the IR at the end.
[repro.ll.txt](https://github.com/user-attachments/files/18479391/repro.ll.txt)
And run:

```bash
llc -O3 -march=amdgcn -mcpu=gfx942  -mtriple amdgcn-amd-hmcsa -global-isel=<0|1> repro.ll -o -
```

# Result #

GISel produces `bfi` instructions whereas SDISel doesn't. (Showing the result only for the smallest reproducer.)

With GISel:
```asm
	s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	ds_read2_b64 v[2:5], v0 offset1:1
	s_mov_b32 s0, 0xffff
	s_waitcnt lgkmcnt(0)
	v_bfi_b32 v2, s0, v4, v4 <--- these
	v_bfi_b32 v3, s0, v5, v5 <--- these
	ds_write_b64 v1, v[2:3]
	s_waitcnt lgkmcnt(0)
	s_setpc_b64 s[30:31]
```

With SDISel:
```asm
	s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	ds_read2_b64 v[2:5], v0 offset1:1
	s_waitcnt lgkmcnt(0)
	ds_write_b64 v1, v[4:5]
	s_waitcnt lgkmcnt(0)
	s_setpc_b64 s[30:31]
```

# Note #

Small reproducer:
```llvm
define void @bla(ptr addrspace(3) %in, ptr addrspace(3) %out) {
  %val = load <8 x half>, ptr addrspace(3) %in, align 8
  %res = shufflevector <8 x half> %val, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  store <4 x half> %res, ptr addrspace(3) %out, align 8
  ret void
}
```

Bigger reproducer (automatically reduced):
```llvm
define amdgpu_kernel void @foo() {
bb:
  %i395.pre = load <8 x half>, ptr addrspace(3) null, align 8
  br label %bb374

bb374: ; preds = %bb374, %bb
  %i375 = phi [1 x [2 x [1 x [4 x [1 x <4 x float>]]]]] [ zeroinitializer, %bb ], [ %i845, %bb374 ]
  %i377 = phi <1 x float> [ zeroinitializer, %bb ], [ %i509, %bb374 ]
  %i414 = shufflevector <8 x half> %i395.pre, <8 x half> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %i415 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %i414, <4 x half> zeroinitializer, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
  %i446 = shufflevector <16 x float> %i415, <16 x float> zeroinitializer, <1 x i32> <i32 6>
  %i509 = fmul <1 x float> %i446, %i377
  %i511 = extractelement <1 x float> %i377, i64 0
  %i665 = insertelement <4 x float> zeroinitializer, float %i511, i64 0
  %i670 = extractvalue [1 x [2 x [1 x [4 x [1 x <4 x float>]]]]] %i375, 0, 1, 0, 0, 0
  %i796 = shufflevector <4 x float> %i670, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %i844 = insertvalue [1 x [4 x [1 x <4 x float>]]] zeroinitializer, <4 x float> %i665, 0, 3, 0
  %i845 = insertvalue [1 x [2 x [1 x [4 x [1 x <4 x float>]]]]] zeroinitializer, [1 x [4 x [1 x <4 x float>]]] %i844, 0, 1
  br label %bb374
}

; Function Attrs: convergent nocallback nofree nosync nounwind willreturn memory(none)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half>, <4 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg) #0

attributes #0 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
```
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to