https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/127563
>From f3accf64d3fa2c8a7bc64f7d9b7bd6c02793f005 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <matthew.arsena...@amd.com> Date: Mon, 17 Feb 2025 22:31:48 +0700 Subject: [PATCH] AMDGPU: Fix overly conservative immediate operand check The real legality check is performed later anyway, so this was unnecessarily blocking immediate folds in handled cases. This also stops folding s_fmac_f32 to s_fmamk_f32 in a few tests, but that seems better. The globalisel changes look suspicious, it may be mishandling constants for VOP3P instructions. --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 3 +- llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll | 16 ++--- .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 60 +++++-------------- llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll | 16 ++--- llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll | 4 +- llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll | 6 +- llvm/test/CodeGen/AMDGPU/constrained-shift.ll | 6 +- llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 31 +++------- llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll | 57 ++++++------------ .../AMDGPU/fold-operands-frame-index.mir | 3 +- .../AMDGPU/fold-operands-scalar-fmac.mir | 13 ++-- .../CodeGen/AMDGPU/fold-sgpr-multi-imm.mir | 8 +-- llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 5 +- .../local-stack-alloc-block-sp-reference.ll | 25 +++----- llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 10 ++-- llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll | 4 +- 16 files changed, 85 insertions(+), 182 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 36288d43443ca..3a019dbaad02c 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -821,7 +821,8 @@ bool SIFoldOperandsImpl::tryToFoldACImm( if (UseOpIdx >= Desc.getNumOperands()) return false; - if (!AMDGPU::isSISrcInlinableOperand(Desc, UseOpIdx)) + // Filter out unhandled pseudos. 
+ if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx)) return false; uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll index 4be00fedb972e..89078f20f1d47 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll @@ -920,9 +920,7 @@ define amdgpu_ps i64 @s_andn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1 ; GFX6-NEXT: s_lshl_b32 s3, s9, 16 ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff ; GFX6-NEXT: s_or_b32 s3, s3, s4 -; GFX6-NEXT: s_mov_b32 s4, -1 -; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], -1 ; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GFX6-NEXT: ; return to shader part epilog ; @@ -962,9 +960,7 @@ define amdgpu_ps i64 @s_andn2_v4i16_commute(<4 x i16> inreg %src0, <4 x i16> inr ; GFX6-NEXT: s_lshl_b32 s3, s9, 16 ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff ; GFX6-NEXT: s_or_b32 s3, s3, s4 -; GFX6-NEXT: s_mov_b32 s4, -1 -; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], -1 ; GFX6-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] ; GFX6-NEXT: ; return to shader part epilog ; @@ -1004,9 +1000,7 @@ define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_use(<4 x i16> inreg %src0, <4 ; GFX6-NEXT: s_lshl_b32 s3, s9, 16 ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff ; GFX6-NEXT: s_or_b32 s3, s3, s4 -; GFX6-NEXT: s_mov_b32 s4, -1 -; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], -1 ; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GFX6-NEXT: ; return to shader part epilog ; @@ -1060,9 +1054,7 @@ define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_foldable_use(<4 x i16> inreg ; GFX6-NEXT: s_lshl_b32 s5, s13, 16 ; GFX6-NEXT: s_and_b32 s6, s12, 0xffff ; GFX6-NEXT: s_or_b32 s5, s5, s6 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, s6 -; 
GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] +; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] ; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5] ; GFX6-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index 38346dd568694..a02e0b37479a0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -1769,9 +1769,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_movk_i32 s0, 0x3e80 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-NEXT: s_add_i32 s0, s0, 4 +; GFX9-NEXT: s_movk_i32 s0, 0x3e84 ; GFX9-NEXT: scratch_store_dword off, v0, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc @@ -1786,8 +1785,7 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; GFX10-NEXT: v_mov_b32_e32 v0, 13 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-NEXT: s_movk_i32 s0, 0x3e80 -; GFX10-NEXT: s_add_i32 s0, s0, 4 +; GFX10-NEXT: s_movk_i32 s0, 0x3e84 ; GFX10-NEXT: scratch_store_dword off, v0, off offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_store_dword off, v1, s0 @@ -1799,11 +1797,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX942-LABEL: store_load_large_imm_offset_kernel: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: v_mov_b32_e32 v0, 13 -; GFX942-NEXT: s_movk_i32 s0, 0x3e80 ; GFX942-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v0, 15 -; GFX942-NEXT: s_add_i32 s0, s0, 4 +; GFX942-NEXT: s_movk_i32 s0, 0x3e84 ; GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: 
scratch_load_dword v0, off, s0 sc0 sc1 @@ -1813,9 +1810,7 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX11-LABEL: store_load_large_imm_offset_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 -; GFX11-NEXT: s_movk_i32 s0, 0x3e80 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s0, s0, 4 +; GFX11-NEXT: s_movk_i32 s0, 0x3e84 ; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc @@ -1843,9 +1838,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; UNALIGNED_GFX9-NEXT: s_mov_b32 s0, 0 ; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4 ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e80 ; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 15 -; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s0, 4 +; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e84 ; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0 ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: scratch_load_dword v0, off, s0 glc @@ -1860,8 +1854,7 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v0, 13 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15 -; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e80 -; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s0, 4 +; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e84 ; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v0, off offset:4 ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v1, s0 @@ -1873,11 +1866,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; UNALIGNED_GFX942-LABEL: store_load_large_imm_offset_kernel: ; UNALIGNED_GFX942: ; %bb.0: ; %bb ; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 13 -; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e80 ; UNALIGNED_GFX942-NEXT: 
scratch_store_dword off, v0, off offset:4 sc0 sc1 ; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 15 -; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s0, 4 +; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e84 ; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1 ; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 @@ -1887,9 +1879,7 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; UNALIGNED_GFX11-LABEL: store_load_large_imm_offset_kernel: ; UNALIGNED_GFX11: ; %bb.0: ; %bb ; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 -; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x3e80 -; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s0, 4 +; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x3e84 ; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc @@ -1923,13 +1913,11 @@ define void @store_load_large_imm_offset_foo() { ; GFX9-LABEL: store_load_large_imm_offset_foo: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s0, 0x3e80 ; GFX9-NEXT: v_mov_b32_e32 v0, 13 -; GFX9-NEXT: s_add_i32 s1, s32, s0 ; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-NEXT: s_add_i32 s0, s1, 4 +; GFX9-NEXT: s_add_i32 s0, s32, 0x3e84 ; GFX9-NEXT: scratch_store_dword off, v0, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc @@ -1940,10 +1928,8 @@ define void @store_load_large_imm_offset_foo() { ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 13 -; GFX10-NEXT: s_movk_i32 s0, 0x3e80 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-NEXT: s_add_i32 s1, s32, s0 -; GFX10-NEXT: s_add_i32 s0, s1, 4 +; 
GFX10-NEXT: s_add_i32 s0, s32, 0x3e84 ; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_store_dword off, v1, s0 @@ -1955,13 +1941,11 @@ define void @store_load_large_imm_offset_foo() { ; GFX942-LABEL: store_load_large_imm_offset_foo: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: s_movk_i32 s0, 0x3e80 ; GFX942-NEXT: v_mov_b32_e32 v0, 13 -; GFX942-NEXT: s_add_i32 s1, s32, s0 ; GFX942-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v0, 15 -; GFX942-NEXT: s_add_i32 s0, s1, 4 +; GFX942-NEXT: s_add_i32 s0, s32, 0x3e84 ; GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 @@ -1972,10 +1956,7 @@ define void @store_load_large_imm_offset_foo() { ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 -; GFX11-NEXT: s_movk_i32 s0, 0x3e80 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s1, s32, s0 -; GFX11-NEXT: s_add_i32 s0, s1, 4 +; GFX11-NEXT: s_add_i32 s0, s32, 0x3e84 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc @@ -2004,13 +1985,11 @@ define void @store_load_large_imm_offset_foo() { ; UNALIGNED_GFX9-LABEL: store_load_large_imm_offset_foo: ; UNALIGNED_GFX9: ; %bb.0: ; %bb ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e80 ; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 13 -; UNALIGNED_GFX9-NEXT: s_add_i32 s1, s32, s0 ; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4 ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 15 -; UNALIGNED_GFX9-NEXT: s_add_i32 s0, 
s1, 4 +; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s32, 0x3e84 ; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0 ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: scratch_load_dword v0, off, s0 glc @@ -2021,10 +2000,8 @@ define void @store_load_large_imm_offset_foo() { ; UNALIGNED_GFX10: ; %bb.0: ; %bb ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v0, 13 -; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e80 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15 -; UNALIGNED_GFX10-NEXT: s_add_i32 s1, s32, s0 -; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s1, 4 +; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s32, 0x3e84 ; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v1, s0 @@ -2036,13 +2013,11 @@ define void @store_load_large_imm_offset_foo() { ; UNALIGNED_GFX942-LABEL: store_load_large_imm_offset_foo: ; UNALIGNED_GFX942: ; %bb.0: ; %bb ; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e80 ; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 13 -; UNALIGNED_GFX942-NEXT: s_add_i32 s1, s32, s0 ; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1 ; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 15 -; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s1, 4 +; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s32, 0x3e84 ; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1 ; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 @@ -2053,10 +2028,7 @@ define void @store_load_large_imm_offset_foo() { ; UNALIGNED_GFX11: ; %bb.0: ; %bb ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 -; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x3e80 -; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) -; UNALIGNED_GFX11-NEXT: s_add_i32 s1, s32, s0 -; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s1, 4 +; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s32, 0x3e84 ; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll index e7119c89ac06c..065fadf3b5ef3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll @@ -919,9 +919,7 @@ define amdgpu_ps i64 @s_orn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1) ; GFX6-NEXT: s_lshl_b32 s3, s9, 16 ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff ; GFX6-NEXT: s_or_b32 s3, s3, s4 -; GFX6-NEXT: s_mov_b32 s4, -1 -; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], -1 ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX6-NEXT: ; return to shader part epilog ; @@ -961,9 +959,7 @@ define amdgpu_ps i64 @s_orn2_v4i16_commute(<4 x i16> inreg %src0, <4 x i16> inre ; GFX6-NEXT: s_lshl_b32 s3, s9, 16 ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff ; GFX6-NEXT: s_or_b32 s3, s3, s4 -; GFX6-NEXT: s_mov_b32 s4, -1 -; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], -1 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX6-NEXT: ; return to shader part epilog ; @@ -1003,9 +999,7 @@ define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_use(<4 x i16> inreg %src0, <4 ; GFX6-NEXT: s_lshl_b32 s3, s9, 16 ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff ; GFX6-NEXT: s_or_b32 s3, s3, s4 -; GFX6-NEXT: s_mov_b32 s4, -1 -; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], -1 ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX6-NEXT: ; return to shader part epilog ; @@ -1059,9 +1053,7 @@ define amdgpu_ps { i64, 
i64 } @s_orn2_v4i16_multi_foldable_use(<4 x i16> inreg % ; GFX6-NEXT: s_lshl_b32 s5, s13, 16 ; GFX6-NEXT: s_and_b32 s6, s12, 0xffff ; GFX6-NEXT: s_or_b32 s5, s5, s6 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, s6 -; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] +; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX6-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll index ed85fb19d9051..43322b1e23412 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll @@ -118,13 +118,11 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in ; GFX7-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff -; GFX7-NEXT: s_mov_b32 s8, -1 ; GFX7-NEXT: s_or_b32 s0, s1, s0 ; GFX7-NEXT: s_lshl_b32 s1, s3, 16 ; GFX7-NEXT: s_and_b32 s2, s2, 0xffff -; GFX7-NEXT: s_mov_b32 s9, s8 ; GFX7-NEXT: s_or_b32 s1, s1, s2 -; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] +; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], -1 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: scalar_xnor_v4i16_one_use: diff --git a/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll b/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll index f6fc69a6e3e47..ea93e3ac1e595 100644 --- a/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll +++ b/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll @@ -5,16 +5,14 @@ define amdgpu_cs <2 x i32> @f() { ; CHECK-LABEL: f: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_mov_b32 s1, 0 ; CHECK-NEXT: s_mov_b32 s5, s4 ; CHECK-NEXT: s_mov_b32 s6, s4 ; CHECK-NEXT: s_mov_b32 s7, s4 -; CHECK-NEXT: s_mov_b32 s0, s4 ; CHECK-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; CHECK-NEXT: s_mov_b32 s1, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[0:1], v[0:1] 
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: s_mov_b32 s1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 ; CHECK-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll index 4011c21af6904..661af021e8a84 100644 --- a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll +++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll @@ -192,10 +192,8 @@ define amdgpu_ps <4 x i32> @s_csh_v4i32(<4 x i32> inreg %a, <4 x i32> inreg %b) ; ; GISEL-LABEL: s_csh_v4i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_mov_b32 s8, 31 -; GISEL-NEXT: s_mov_b32 s9, s8 -; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] -; GISEL-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9] +; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], 31 +; GISEL-NEXT: s_and_b64 s[6:7], s[6:7], 31 ; GISEL-NEXT: s_lshl_b32 s8, s0, s4 ; GISEL-NEXT: s_lshl_b32 s9, s1, s5 ; GISEL-NEXT: s_lshl_b32 s10, s2, s6 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index 4cfdb968f7090..e7c8604776ce0 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -3498,8 +3498,7 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_movk_i32 s0, 0x3000 -; GFX9-NEXT: s_add_i32 s0, s0, 4 +; GFX9-NEXT: s_movk_i32 s0, 0x3004 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3515,8 +3514,7 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; GFX10-NEXT: v_mov_b32_e32 v0, 13 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-NEXT: s_movk_i32 s0, 0x3800 -; GFX10-NEXT: s_add_i32 s0, s0, 4 +; GFX10-NEXT: 
s_movk_i32 s0, 0x3804 ; GFX10-NEXT: scratch_store_dword off, v0, off offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 @@ -3561,8 +3559,7 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:4 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 -; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4 +; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3004 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) @@ -3596,9 +3593,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 15 -; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3800 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 -; GFX1010-PAL-NEXT: s_add_i32 s0, s0, 4 +; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3804 ; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s1 offset:4 ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 @@ -3620,8 +3616,7 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 15 -; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3800 -; GFX1030-PAL-NEXT: s_add_i32 s0, s0, 4 +; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3804 ; GFX1030-PAL-NEXT: scratch_store_dword off, v0, off offset:4 ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 @@ -3667,12 +3662,10 @@ define void @store_load_large_imm_offset_foo() { ; GFX9-LABEL: store_load_large_imm_offset_foo: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s0, 0x3000 ; 
GFX9-NEXT: v_mov_b32_e32 v0, 13 -; GFX9-NEXT: s_add_i32 s1, s32, s0 ; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s1, 4 +; GFX9-NEXT: s_add_i32 s0, s32, 0x3004 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3684,10 +3677,8 @@ define void @store_load_large_imm_offset_foo() { ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 13 -; GFX10-NEXT: s_movk_i32 s0, 0x3800 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-NEXT: s_add_i32 s1, s32, s0 -; GFX10-NEXT: s_add_i32 s0, s1, 4 +; GFX10-NEXT: s_add_i32 s0, s32, 0x3804 ; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 @@ -3729,12 +3720,10 @@ define void @store_load_large_imm_offset_foo() { ; GFX9-PAL-LABEL: store_load_large_imm_offset_foo: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 -; GFX9-PAL-NEXT: s_add_i32 s1, s32, s0 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_add_i32 s0, s1, 4 +; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x3004 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) @@ -3760,10 +3749,8 @@ define void @store_load_large_imm_offset_foo() { ; GFX10-PAL: ; %bb.0: ; %bb ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13 -; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-PAL-NEXT: s_add_i32 s1, s32, s0 -; GFX10-PAL-NEXT: s_add_i32 s0, s1, 4 +; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x3804 ; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX10-PAL-NEXT: 
s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll index 9ae60f99d5e09..8c91acd5ae024 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll @@ -2106,33 +2106,12 @@ define <2 x double> @v_fma_mul_add_32_v2f64(<2 x double> %x, <2 x double> %y) { ; GFX9-NEXT: v_fma_f64 v[2:3], v[2:3], s[4:5], v[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: v_fma_mul_add_32_v2f64: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_fma_f64 v[0:1], 0x40400000, v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_fma_f64 v[2:3], 0x40400000, v[2:3], v[6:7] -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: v_fma_mul_add_32_v2f64: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], 0x40400000, v[4:5] -; GFX10-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], 0x40400000, v[6:7] -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: v_fma_mul_add_32_v2f64: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_fma_f64 v[0:1], 0x40400000, v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_fma_f64 v[2:3], 0x40400000, v[2:3], v[6:7] -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: v_fma_mul_add_32_v2f64: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], 0x40400000, v[4:5] -; GFX11-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], 0x40400000, v[6:7] -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1011-LABEL: v_fma_mul_add_32_v2f64: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_fma_f64 v[0:1], 0x40400000, v[0:1], v[4:5] +; GFX1011-NEXT: v_fma_f64 v[2:3], 0x40400000, v[2:3], v[6:7] +; 
GFX1011-NEXT: s_setpc_b64 s[30:31] %mul = fmul contract <2 x double> %x, <double 32.0, double 32.0> %fma = fadd contract <2 x double> %mul, %y ret <2 x double> %fma @@ -2490,8 +2469,8 @@ define <2 x double> @v_mul_16_v2f64(<2 x double> %x) { ; GFX10-GISEL-LABEL: v_mul_16_v2f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], 0x40300000 -; GFX10-GISEL-NEXT: v_mul_f64 v[2:3], v[2:3], 0x40300000 +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x40300000, v[0:1] +; GFX10-GISEL-NEXT: v_mul_f64 v[2:3], 0x40300000, v[2:3] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_16_v2f64: @@ -2504,8 +2483,8 @@ define <2 x double> @v_mul_16_v2f64(<2 x double> %x) { ; GFX11-GISEL-LABEL: v_mul_16_v2f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], 0x40300000 -; GFX11-GISEL-NEXT: v_mul_f64 v[2:3], v[2:3], 0x40300000 +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x40300000, v[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[2:3], 0x40300000, v[2:3] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul <2 x double> %x, <double 16.0, double 16.0> ret <2 x double> %mul @@ -2538,8 +2517,8 @@ define <2 x double> @v_mul_neg16_v2f64(<2 x double> %x) { ; GFX10-GISEL-LABEL: v_mul_neg16_v2f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], 0xc0300000 -; GFX10-GISEL-NEXT: v_mul_f64 v[2:3], v[2:3], 0xc0300000 +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0xc0300000, v[0:1] +; GFX10-GISEL-NEXT: v_mul_f64 v[2:3], 0xc0300000, v[2:3] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_neg16_v2f64: @@ -2552,8 +2531,8 @@ define <2 x double> @v_mul_neg16_v2f64(<2 x double> %x) { ; GFX11-GISEL-LABEL: v_mul_neg16_v2f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], v[0:1], 
0xc0300000 -; GFX11-GISEL-NEXT: v_mul_f64 v[2:3], v[2:3], 0xc0300000 +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0xc0300000, v[0:1] +; GFX11-GISEL-NEXT: v_mul_f64 v[2:3], 0xc0300000, v[2:3] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %mul = fmul <2 x double> %x, <double -16.0, double -16.0> ret <2 x double> %mul @@ -2586,8 +2565,8 @@ define <2 x double> @v_mul_fabs_16_v2f64(<2 x double> %x) { ; GFX10-GISEL-LABEL: v_mul_fabs_16_v2f64: ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, 0x40300000 -; GFX10-GISEL-NEXT: v_mul_f64 v[2:3], |v[2:3]|, 0x40300000 +; GFX10-GISEL-NEXT: v_mul_f64 v[0:1], 0x40300000, |v[0:1]| +; GFX10-GISEL-NEXT: v_mul_f64 v[2:3], 0x40300000, |v[2:3]| ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_mul_fabs_16_v2f64: @@ -2600,8 +2579,8 @@ define <2 x double> @v_mul_fabs_16_v2f64(<2 x double> %x) { ; GFX11-GISEL-LABEL: v_mul_fabs_16_v2f64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], |v[0:1]|, 0x40300000 -; GFX11-GISEL-NEXT: v_mul_f64 v[2:3], |v[2:3]|, 0x40300000 +; GFX11-GISEL-NEXT: v_mul_f64 v[0:1], 0x40300000, |v[0:1]| +; GFX11-GISEL-NEXT: v_mul_f64 v[2:3], 0x40300000, |v[2:3]| ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %x.fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %x) %mul = fmul <2 x double> %x.fabs, <double 16.0, double 16.0> diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir index 280126a0d7cd2..6ab1395a0dcca 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir @@ -75,8 +75,7 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__s_add_i32__fi_materializedconst_0 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 256 - ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[S_MOV_B32_]], 
implicit-def $scc + ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, 256, implicit-def $scc ; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]] ; CHECK-NEXT: SI_RETURN implicit $sgpr4 %0:sreg_32 = S_MOV_B32 %stack.0 diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir index 2492eb2982aac..6e52cb0265bed 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir @@ -13,7 +13,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 [[COPY]], 1056964608, [[COPY1]], implicit $mode + ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAC_F32 1056964608, [[COPY]], [[COPY1]], implicit $mode ; CHECK-NEXT: $sgpr0 = COPY %fma %0:sreg_32 = COPY $sgpr0 %1:sreg_32 = COPY $sgpr1 @@ -33,7 +33,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 [[COPY]], 1056964608, [[COPY1]], implicit $mode + ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAC_F32 [[COPY]], 1056964608, [[COPY1]], implicit $mode ; CHECK-NEXT: $sgpr0 = COPY %fma %0:sreg_32 = COPY $sgpr0 %1:sreg_32 = COPY $sgpr1 @@ -73,7 +73,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 [[COPY]], 1234567890, [[COPY1]], implicit $mode + ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAC_F32 1234567890, [[COPY]], [[COPY1]], implicit $mode ; CHECK-NEXT: $sgpr0 = COPY %fma %0:sreg_32 = COPY $sgpr0 %1:sreg_32 = COPY $sgpr1 @@ -93,7 +93,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; CHECK-NEXT: 
%fma:sreg_32 = nofpexcept S_FMAMK_F32 [[COPY]], 1234567890, [[COPY1]], implicit $mode + ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAC_F32 [[COPY]], 1234567890, [[COPY1]], implicit $mode ; CHECK-NEXT: $sgpr0 = COPY %fma %0:sreg_32 = COPY $sgpr0 %1:sreg_32 = COPY $sgpr1 @@ -212,8 +212,7 @@ body: | ; CHECK: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK-NEXT: %noninlinable:sreg_32 = S_MOV_B32 1234567890 - ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 [[COPY]], 1056964608, %noninlinable, implicit $mode + ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAAK_F32 [[COPY]], 1056964608, 1234567890, implicit $mode ; CHECK-NEXT: $sgpr0 = COPY %fma %0:sreg_32 = COPY $sgpr0 %inlinable:sreg_32 = S_MOV_B32 1056964608 @@ -232,7 +231,7 @@ body: | ; CHECK: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 1234567890, 1234567890, [[COPY]], implicit $mode + ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAC_F32 1234567890, 1234567890, [[COPY]], implicit $mode ; CHECK-NEXT: $sgpr0 = COPY %fma %0:sreg_32 = COPY $sgpr0 %noninlinable:sreg_32 = S_MOV_B32 1234567890 diff --git a/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir b/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir index c8afb89aa272a..d85d77e338870 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir @@ -46,7 +46,7 @@ body: | %2:sreg_32 = S_LSHL2_ADD_U32 %0, %1, implicit-def $scc ... # GCN-LABEL: name: test_frameindex{{$}} -# GCN: %1:sreg_32 = S_ADD_I32 %stack.0, %0 +# GCN: %1:sreg_32 = S_ADD_I32 %stack.0, 70 --- name: test_frameindex tracksRegLiveness: true @@ -117,7 +117,7 @@ body: | ... # GCN-LABEL: name: test_fold_same_literal_2x{{$}} -# GCN: %2:sreg_32 = S_ADD_I32 70, %1 +# GCN: %2:sreg_32 = S_ADD_I32 70, 70 --- name: test_fold_same_literal_2x tracksRegLiveness: true @@ -129,7 +129,7 @@ body: | ... 
# GCN-LABEL: name: test_fold_same_literal_lhs{{$}} -# GCN: %1:sreg_32 = S_ADD_I32 70, %0 +# GCN: %1:sreg_32 = S_ADD_I32 70, 70 --- name: test_fold_same_literal_lhs tracksRegLiveness: true @@ -140,7 +140,7 @@ body: | ... # GCN-LABEL: name: test_fold_same_literal_rhs{{$}} -# GCN: %1:sreg_32 = S_ADD_I32 %0, 70 +# GCN: %1:sreg_32 = S_ADD_I32 70, 70 --- name: test_fold_same_literal_rhs tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index 492a30b67089c..bc49f70cbee11 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -742,10 +742,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100000001(ptr addrspace(1) ; ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000001: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_mov_b32 s0, 1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_mov_b32 s1, s0 -; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], 1 ; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll index 0edc7cb01887b..8ec3b7e2508ac 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -74,8 +74,7 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_1 ; FLATSCR-NEXT: ; %bb.2: ; %split -; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 -; FLATSCR-NEXT: s_addk_i32 s0, 0x3000 +; FLATSCR-NEXT: s_movk_i32 s0, 0x5000 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:208 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) 
; FLATSCR-NEXT: s_movk_i32 s0, 0x3000 @@ -176,9 +175,7 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_1 ; FLATSCR-NEXT: ; %bb.2: ; %split -; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 -; FLATSCR-NEXT: s_add_i32 s1, s33, s0 -; FLATSCR-NEXT: s_add_i32 s0, s1, 0x3000 +; FLATSCR-NEXT: s_add_i32 s0, s33, 0x5000 ; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:208 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_add_i32 s0, s33, 0x3000 @@ -227,30 +224,25 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out ; MUBUF-NEXT: s_cbranch_scc1 .LBB2_1 ; MUBUF-NEXT: ; %bb.2: ; %split ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000 -; MUBUF-NEXT: s_movk_i32 s4, 0x12d4 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 ; MUBUF-NEXT: v_or_b32_e32 v0, 0x12c0, v1 -; MUBUF-NEXT: v_or_b32_e32 v1, s4, v2 -; MUBUF-NEXT: s_movk_i32 s4, 0x12d0 +; MUBUF-NEXT: v_or_b32_e32 v1, 0x12d4, v2 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 ; MUBUF-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_or_b32_e32 v1, s4, v2 -; MUBUF-NEXT: s_movk_i32 s4, 0x12c4 +; MUBUF-NEXT: v_or_b32_e32 v1, 0x12d0, v2 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 ; MUBUF-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_or_b32_e32 v1, s4, v2 +; MUBUF-NEXT: v_or_b32_e32 v1, 0x12c4, v2 ; MUBUF-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: buffer_load_dword v7, v0, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_movk_i32 s4, 0x12cc ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000 -; MUBUF-NEXT: v_or_b32_e32 v0, s4, v1 -; MUBUF-NEXT: s_movk_i32 s4, 0x12c8 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 -; MUBUF-NEXT: v_or_b32_e32 v1, s4, v2 +; MUBUF-NEXT: v_or_b32_e32 v0, 0x12cc, v1 +; MUBUF-NEXT: v_or_b32_e32 v1, 0x12c8, v2 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 ; 
MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -305,8 +297,7 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_cbranch_scc1 .LBB2_1 ; FLATSCR-NEXT: ; %bb.2: ; %split -; FLATSCR-NEXT: s_movk_i32 s0, 0x1000 -; FLATSCR-NEXT: s_addk_i32 s0, 0x2000 +; FLATSCR-NEXT: s_movk_i32 s0, 0x3000 ; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:720 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 offset:704 glc diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index b59f3c0d410f8..9b03a72fd826d 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -87,7 +87,7 @@ define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) { ; GCN-LABEL: {{^}}fadd_v2_v_lit_splat: ; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} ; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0 op_sel_hi:[1,0]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} +; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0{{$}} define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id @@ -308,7 +308,7 @@ define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) { ; GCN-LABEL: {{^}}fmul_v2_v_lit_splat: ; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} ; PACKED-SDAG: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0 op_sel_hi:[1,0]{{$}} -; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} +; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0{{$}} define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr 
inbounds <2 x float>, ptr addrspace(1) %a, i32 %id @@ -432,7 +432,7 @@ define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) { ; GCN-LABEL: {{^}}fma_v2_v_lit_splat: ; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0 ; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0 op_sel_hi:[1,0,0]{{$}} -; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} +; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0{{$}} define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id @@ -556,8 +556,8 @@ bb: ; PACKED-SDAG: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0 ; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} -; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} +; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0{{$}} +; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0{{$}} define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) { bb: %i12 = fadd <2 x float> zeroinitializer, %arg diff --git a/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll b/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll index 81d792183dc06..debbfce7dadcc 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll @@ -218,7 +218,7 @@ define amdgpu_ps float @_amdgpu_ps_main() { ; GFX1150-NEXT: s_mov_b32 s3, s0 ; GFX1150-NEXT: s_buffer_load_b64 s[0:1], s[0:3], 0x0 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0) -; GFX1150-NEXT: s_fmamk_f32 s0, s1, 0x40800000, s0 +; GFX1150-NEXT: s_fmac_f32 s0, s1, 4.0 ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-NEXT: v_mov_b32_e32 v0, s0 ; GFX1150-NEXT: ; return to shader part epilog @@ -232,7 +232,7 @@ define amdgpu_ps float @_amdgpu_ps_main() { ; GFX12-NEXT: 
s_mov_b32 s3, s0 ; GFX12-NEXT: s_buffer_load_b64 s[0:1], s[0:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_fmamk_f32 s0, s1, 0x40800000, s0 +; GFX12-NEXT: s_fmac_f32 s0, s1, 4.0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: ; return to shader part epilog _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits