https://github.com/llvmbot updated https://github.com/llvm/llvm-project/pull/128132
>From e6d4fd035fdf90348fbeba6e73f90feb6e66b30b Mon Sep 17 00:00:00 2001 From: Matt Arsenault <matthew.arsena...@amd.com> Date: Fri, 21 Feb 2025 12:08:49 +0700 Subject: [PATCH] AMDGPU: Widen f16 minimum/maximum to v2f16 on gfx950 (#128121) Unfortunately we only have the vector versions of v2f16 minimum3 and maximum. Widen to v2f16 so we can lower as minimum3(x, y, y). (cherry picked from commit e729dc759d052de122c8a918fe51b05ac796bb50) --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 40 +- llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 + llvm/test/CodeGen/AMDGPU/fmaximum3.ll | 689 ++++++++++++------- llvm/test/CodeGen/AMDGPU/fminimum3.ll | 689 ++++++++++++------- llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 66 +- llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 66 +- 6 files changed, 966 insertions(+), 585 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e09df53995d61..d45ae7398e25d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -869,8 +869,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->hasMinimum3Maximum3F32()) setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal); - if (Subtarget->hasMinimum3Maximum3PKF16()) + if (Subtarget->hasMinimum3Maximum3PKF16()) { setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal); + + // If only the vector form is available, we need to widen to a vector. 
+ if (!Subtarget->hasMinimum3Maximum3F16()) + setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom); + } } setOperationAction(ISD::INTRINSIC_WO_CHAIN, @@ -5963,6 +5968,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FMINNUM: case ISD::FMAXNUM: return lowerFMINNUM_FMAXNUM(Op, DAG); + case ISD::FMINIMUM: + case ISD::FMAXIMUM: + return lowerFMINIMUM_FMAXIMUM(Op, DAG); case ISD::FLDEXP: case ISD::STRICT_FLDEXP: return lowerFLDEXP(Op, DAG); @@ -5984,8 +5992,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FMUL: case ISD::FMINNUM_IEEE: case ISD::FMAXNUM_IEEE: - case ISD::FMINIMUM: - case ISD::FMAXIMUM: case ISD::FMINIMUMNUM: case ISD::FMAXIMUMNUM: case ISD::UADDSAT: @@ -6840,6 +6846,34 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, return Op; } +SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + if (VT.isVector()) + return splitBinaryVectorOp(Op, DAG); + + assert(!Subtarget->hasIEEEMinMax() && !Subtarget->hasMinimum3Maximum3F16() && + Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 && + "should not need to widen f16 minimum/maximum to v2f16"); + + // Widen f16 operation to v2f16 + + // fminimum f16:x, f16:y -> + // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x)) + // (v2f16 (scalar_to_vector y))), 0 + SDLoc SL(Op); + SDValue WideSrc0 = + DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0)); + SDValue WideSrc1 = + DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1)); + + SDValue Widened = + DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened, + DAG.getConstant(0, SL, MVT::i32)); +} + SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const { bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP; EVT VT = Op.getValueType(); diff --git 
a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 1cd7f1b29e077..9b2c14862407a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -146,6 +146,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { /// Custom lowering for ISD::FP_ROUND for MVT::f16. SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const; SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const; SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index f0fa621e3b4bc..6724c37605eb4 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -1251,19 +1251,27 @@ define half @v_fmaximum3_f16(half %a, half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f16_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; 
GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half %c) ret half %max1 @@ -1280,19 +1288,27 @@ define half @v_fmaximum3_f16_commute(half %a, half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, v2, v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_commute: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_max_f16_e32 v1, v2, v0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_commute: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f16_e32 v1, v2, v0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v2, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_commute: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v2, v0, v0 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half 
%a, half %b) %max1 = call half @llvm.maximum.f16(half %c, half %max0) ret half %max1 @@ -1309,22 +1325,34 @@ define amdgpu_ps i32 @s_fmaximum3_f16(half inreg %a, half inreg %b, half inreg % ; GFX12-NEXT: v_readfirstlane_b32 s0, v0 ; GFX12-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_fmaximum3_f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_max_f16_e32 v1, s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_max_f16_e32 v1, s2, v0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: ; return to shader part epilog +; GFX942-LABEL: s_fmaximum3_f16: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NEXT: v_max_f16_e32 v1, s0, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: v_max_f16_e32 v1, s2, v0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_readfirstlane_b32 s0, v0 +; GFX942-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_fmaximum3_f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s1, s1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s2, s2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_readfirstlane_b32 s0, v0 +; GFX950-NEXT: ; return to shader part epilog %max0 = call half @llvm.maximum.f16(half %a, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half %c) %cast = bitcast half 
%max1 to i16 @@ -1344,19 +1372,28 @@ define half @v_fmaximum3_f16_fabs0(half %a, half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, |v0|, v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_fabs0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e64 v3, |v0|, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_fabs0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e64 v3, |v0|, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f16_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_fabs0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) %max0 = call half @llvm.maximum.f16(half %a.fabs, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half %c) @@ -1374,19 +1411,28 @@ define half @v_fmaximum3_f16_fabs1(half %a, half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, v0, |v1|, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_fabs1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e64 v3, v0, |v1| -; 
GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v1| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_fabs1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e64 v3, v0, |v1| +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v1| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f16_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_fabs1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %b.fabs = call half @llvm.fabs.f16(half %b) %max0 = call half @llvm.maximum.f16(half %a, half %b.fabs) %max1 = call half @llvm.maximum.f16(half %max0, half %c) @@ -1404,19 +1450,28 @@ define half @v_fmaximum3_f16_fabs2(half %a, half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, |v2| ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_fabs2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_max_f16_e64 v1, v0, |v2| -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; 
GFX942-LABEL: v_fmaximum3_f16_fabs2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f16_e64 v1, v0, |v2| +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_fabs2: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %c.fabs = call half @llvm.fabs.f16(half %c) %max0 = call half @llvm.maximum.f16(half %a, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half %c.fabs) @@ -1434,19 +1489,30 @@ define half @v_fmaximum3_f16_fabs_all(half %a, half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, |v0|, |v1|, |v2| ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_fabs_all: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e64 v3, |v0|, |v1| -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_max_f16_e64 v1, v0, |v2| -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_fabs_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e64 v3, |v0|, |v1| +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, 
vcc +; GFX942-NEXT: v_max_f16_e64 v1, v0, |v2| +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_fabs_all: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) %b.fabs = call half @llvm.fabs.f16(half %b) %c.fabs = call half @llvm.fabs.f16(half %c) @@ -1466,19 +1532,30 @@ define half @v_fmaximum3_f16_fneg_all(half %a, half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, -v0, -v1, -v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_fneg_all: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e64 v3, -v0, -v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_max_f16_e64 v1, v0, -v2 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_fneg_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e64 v3, -v0, -v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f16_e64 v1, v0, -v2 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_fneg_all: +; GFX950: 
; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg half %a %b.fneg = fneg half %b %c.fneg = fneg half %c @@ -1498,19 +1575,30 @@ define half @v_fmaximum3_f16_fneg_fabs_all(half %a, half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, -|v0|, -|v1|, -|v2| ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_fneg_fabs_all: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e64 v3, -|v0|, -|v1| -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -|v0|, -|v1| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_max_f16_e64 v1, v0, -|v2| -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -|v2| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_fneg_fabs_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e64 v3, -|v0|, -|v1| +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -|v0|, -|v1| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f16_e64 v1, v0, -|v2| +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -|v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_fneg_fabs_all: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; GFX950-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX950-NEXT: v_or_b32_e32 v2, 0x8000, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; 
GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) %b.fabs = call half @llvm.fabs.f16(half %b) %c.fabs = call half @llvm.fabs.f16(half %c) @@ -1533,19 +1621,28 @@ define half @v_fmaximum3_f16_fneg0(half %a, half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, -v0, v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_fneg0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e64 v3, -v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_fneg0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e64 v3, -v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f16_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_fneg0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg half %a %max0 = call half @llvm.maximum.f16(half %a.fneg, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half %c) @@ -1563,19 +1660,28 @@ define half @v_fmaximum3_f16_fneg1(half %a, half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, v0, -v1, v2 ; GFX12-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_fneg1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e64 v3, v0, -v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_fneg1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e64 v3, v0, -v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f16_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_fneg1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %b.fneg = fneg half %b %max0 = call half @llvm.maximum.f16(half %a, half %b.fneg) %max1 = call half @llvm.maximum.f16(half %max0, half %c) @@ -1593,19 +1699,28 @@ define half @v_fmaximum3_f16_fneg2(half %a, half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, -v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_fneg2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_max_f16_e64 v1, v0, -v2 -; 
GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_fneg2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f16_e64 v1, v0, -v2 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_fneg2: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %c.fneg = fneg half %c %max0 = call half @llvm.maximum.f16(half %a, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half %c.fneg) @@ -1623,19 +1738,28 @@ define half @v_fmaximum3_f16_const0(half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, v0, 0x4800, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_const0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v2, 0x4800, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_const0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e32 v2, 0x4800, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX942-NEXT: 
v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_const0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_movk_i32 s0, 0x4800 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s0, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half 8.0, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half %c) ret half %max1 @@ -1652,19 +1776,27 @@ define half @v_fmaximum3_f16__const2(half %a, half %b) { ; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, 0x4800 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16__const2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_max_f16_e32 v1, 0x4800, v0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16__const2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_max_f16_e32 v1, 0x4800, v0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16__const2: +; GFX950: ; %bb.0: +; GFX950-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_movk_i32 s0, 0x4800 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half 8.0) ret half %max1 @@ -1681,19 +1813,27 @@ define half @v_fmaximum3_f16_inlineimm0(half %b, half %c) { ; GFX12-NEXT: v_maximum3_f16 v0, v0, 4.0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_inlineimm0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v2, 4.0, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_inlineimm0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e32 v2, 4.0, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_inlineimm0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 4.0, 4.0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half 4.0, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half %c) ret half %max1 @@ -1710,19 +1850,27 @@ define half @v_fmaximum3_f16__inlineimm(half 
%a, half %b) { ; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16__inlineimm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_max_f16_e32 v1, 4.0, v0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16__inlineimm: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_max_f16_e32 v1, 4.0, v0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16__inlineimm: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 4.0, 4.0 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half 4.0) ret half %max1 @@ -1741,19 +1889,28 @@ define half @v_fmaximum3_f16_const1_const2(half %a) { ; GFX12-NEXT: v_maximum3_f16 v0, v0, s0, 0x4c00 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_f16_const1_const2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v1, 0x4800, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; 
GFX9-NEXT: v_max_f16_e32 v1, 0x4c00, v0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fmaximum3_f16_const1_const2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e32 v1, 0x4800, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: v_max_f16_e32 v1, 0x4c00, v0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_f16_const1_const2: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_movk_i32 s0, 0x4800 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s0, s0 +; GFX950-NEXT: s_movk_i32 s0, 0x4c00 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half 8.0) %max1 = call half @llvm.maximum.f16(half %max0, half 16.0) ret half %max1 @@ -3620,20 +3777,30 @@ define <2 x half> @v_no_fmaximum3_f16__multi_use(half %a, half %b, half %c) { ; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_no_fmaximum3_f16__multi_use: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_no_fmaximum3_f16__multi_use: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_max_f16_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_max_f16_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX942-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_no_fmaximum3_f16__multi_use: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v0, v2, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half %c) %insert.0 = insertelement <2 x half> poison, half %max0, i32 0 @@ -3651,23 +3818,35 @@ define amdgpu_ps <2 x i32> @s_no_fmaximum3_f16__multi_use(half inreg %a, half in ; GFX12-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX12-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_no_fmaximum3_f16__multi_use: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_max_f16_e32 v1, s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_max_f16_e32 v1, s2, v0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NEXT: ; return to shader part epilog +; GFX942-LABEL: s_no_fmaximum3_f16__multi_use: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NEXT: v_max_f16_e32 v1, s0, v0 +; GFX942-NEXT: 
v_mov_b32_e32 v2, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: v_max_f16_e32 v1, s2, v0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX942-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX942-NEXT: v_readfirstlane_b32 s0, v0 +; GFX942-NEXT: v_readfirstlane_b32 s1, v1 +; GFX942-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_no_fmaximum3_f16__multi_use: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s1, s1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v0, s2, s2 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX950-NEXT: v_readfirstlane_b32 s0, v0 +; GFX950-NEXT: v_readfirstlane_b32 s1, v1 +; GFX950-NEXT: ; return to shader part epilog %max0 = call half @llvm.maximum.f16(half %a, half %b) %max1 = call half @llvm.maximum.f16(half %max0, half %c) %cast0 = bitcast half %max0 to i16 diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll index 7a8a224c76a83..04d21548187a0 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -1251,19 +1251,27 @@ define half @v_fminimum3_f16(half %a, half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; 
GFX942-LABEL: v_fminimum3_f16: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f16_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half %c) ret half %max1 @@ -1280,19 +1288,27 @@ define half @v_fminimum3_f16_commute(half %a, half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, v2, v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_commute: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_min_f16_e32 v1, v2, v0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_commute: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f16_e32 v1, v2, v0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v2, v0 +; GFX942-NEXT: s_nop 1 +; 
GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_commute: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v2, v0, v0 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half %b) %max1 = call half @llvm.minimum.f16(half %c, half %max0) ret half %max1 @@ -1309,22 +1325,34 @@ define amdgpu_ps i32 @s_fminimum3_f16(half inreg %a, half inreg %b, half inreg % ; GFX12-NEXT: v_readfirstlane_b32 s0, v0 ; GFX12-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_fminimum3_f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_min_f16_e32 v1, s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_min_f16_e32 v1, s2, v0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: ; return to shader part epilog +; GFX942-LABEL: s_fminimum3_f16: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NEXT: v_min_f16_e32 v1, s0, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: v_min_f16_e32 v1, s2, v0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_readfirstlane_b32 s0, v0 +; GFX942-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_fminimum3_f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_pk_minimum3_f16 
v0, v0, s1, s1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s2, s2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_readfirstlane_b32 s0, v0 +; GFX950-NEXT: ; return to shader part epilog %max0 = call half @llvm.minimum.f16(half %a, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half %c) %cast = bitcast half %max1 to i16 @@ -1344,19 +1372,28 @@ define half @v_fminimum3_f16_fabs0(half %a, half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, |v0|, v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_fabs0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e64 v3, |v0|, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_fabs0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e64 v3, |v0|, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f16_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_fabs0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) %max0 = call half 
@llvm.minimum.f16(half %a.fabs, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half %c) @@ -1374,19 +1411,28 @@ define half @v_fminimum3_f16_fabs1(half %a, half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, v0, |v1|, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_fabs1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e64 v3, v0, |v1| -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v1| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_fabs1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e64 v3, v0, |v1| +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v1| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f16_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_fabs1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %b.fabs = call half @llvm.fabs.f16(half %b) %max0 = call half @llvm.minimum.f16(half %a, half %b.fabs) %max1 = call half @llvm.minimum.f16(half %max0, half %c) @@ -1404,19 +1450,28 @@ define half @v_fminimum3_f16_fabs2(half %a, half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, |v2| ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_fabs2: -; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_min_f16_e64 v1, v0, |v2| -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_fabs2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f16_e64 v1, v0, |v2| +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_fabs2: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %c.fabs = call half @llvm.fabs.f16(half %c) %max0 = call half @llvm.minimum.f16(half %a, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half %c.fabs) @@ -1434,19 +1489,30 @@ define half @v_fminimum3_f16_fabs_all(half %a, half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, |v0|, |v1|, |v2| ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_fabs_all: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e64 v3, |v0|, |v1| -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_min_f16_e64 v1, v0, |v2| -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| -; 
GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_fabs_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e64 v3, |v0|, |v1| +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f16_e64 v1, v0, |v2| +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_fabs_all: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) %b.fabs = call half @llvm.fabs.f16(half %b) %c.fabs = call half @llvm.fabs.f16(half %c) @@ -1466,19 +1532,30 @@ define half @v_fminimum3_f16_fneg_all(half %a, half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, -v0, -v1, -v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_fneg_all: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e64 v3, -v0, -v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_min_f16_e64 v1, v0, -v2 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_fneg_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e64 v3, 
-v0, -v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f16_e64 v1, v0, -v2 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_fneg_all: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg half %a %b.fneg = fneg half %b %c.fneg = fneg half %c @@ -1498,19 +1575,30 @@ define half @v_fminimum3_f16_fneg_fabs_all(half %a, half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, -|v0|, -|v1|, -|v2| ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_fneg_fabs_all: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e64 v3, -|v0|, -|v1| -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -|v0|, -|v1| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_min_f16_e64 v1, v0, -|v2| -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -|v2| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_fneg_fabs_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e64 v3, -|v0|, -|v1| +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -|v0|, -|v1| +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f16_e64 v1, v0, -|v2| +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -|v2| +; GFX942-NEXT: s_nop 
1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_fneg_fabs_all: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; GFX950-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX950-NEXT: v_or_b32_e32 v2, 0x8000, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) %b.fabs = call half @llvm.fabs.f16(half %b) %c.fabs = call half @llvm.fabs.f16(half %c) @@ -1533,19 +1621,28 @@ define half @v_fminimum3_f16_fneg0(half %a, half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, -v0, v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_fneg0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e64 v3, -v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_fneg0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e64 v3, -v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f16_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_fneg0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX950-NEXT: 
v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg half %a %max0 = call half @llvm.minimum.f16(half %a.fneg, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half %c) @@ -1563,19 +1660,28 @@ define half @v_fminimum3_f16_fneg1(half %a, half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, v0, -v1, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_fneg1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e64 v3, v0, -v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_fneg1: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e64 v3, v0, -v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f16_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_fneg1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %b.fneg = fneg half %b %max0 = call half @llvm.minimum.f16(half %a, half %b.fneg) %max1 = call half @llvm.minimum.f16(half %max0, half %c) @@ -1593,19 +1699,28 @@ define half @v_fminimum3_f16_fneg2(half %a, half %b, half 
%c) { ; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, -v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_fneg2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_min_f16_e64 v1, v0, -v2 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_fneg2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f16_e64 v1, v0, -v2 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_fneg2: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %c.fneg = fneg half %c %max0 = call half @llvm.minimum.f16(half %a, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half %c.fneg) @@ -1623,19 +1738,28 @@ define half @v_fminimum3_f16_const0(half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, v0, 0x4800, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_const0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v2, 0x4800, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: 
v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_const0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e32 v2, 0x4800, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_const0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_movk_i32 s0, 0x4800 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s0, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half 8.0, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half %c) ret half %max1 @@ -1652,19 +1776,27 @@ define half @v_fminimum3_f16__const2(half %a, half %b) { ; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, 0x4800 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16__const2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_min_f16_e32 v1, 0x4800, v0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16__const2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e32 v2, v0, v1 
+; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_min_f16_e32 v1, 0x4800, v0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16__const2: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_movk_i32 s0, 0x4800 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half 8.0) ret half %max1 @@ -1681,19 +1813,27 @@ define half @v_fminimum3_f16_inlineimm0(half %b, half %c) { ; GFX12-NEXT: v_minimum3_f16 v0, v0, 4.0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_inlineimm0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v2, 4.0, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_inlineimm0: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e32 v2, 4.0, v0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: 
v_fminimum3_f16_inlineimm0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 4.0, 4.0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half 4.0, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half %c) ret half %max1 @@ -1710,19 +1850,27 @@ define half @v_fminimum3_f16__inlineimm(half %a, half %b) { ; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16__inlineimm: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_min_f16_e32 v1, 4.0, v0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16__inlineimm: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_min_f16_e32 v1, 4.0, v0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16__inlineimm: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 4.0, 4.0 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half 4.0) ret half %max1 @@ -1741,19 
+1889,28 @@ define half @v_fminimum3_f16_const1_const2(half %a) { ; GFX12-NEXT: v_minimum3_f16 v0, v0, s0, 0x4c00 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_f16_const1_const2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v1, 0x4800, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_min_f16_e32 v1, 0x4c00, v0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_fminimum3_f16_const1_const2: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e32 v1, 0x4800, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: v_min_f16_e32 v1, 0x4c00, v0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_f16_const1_const2: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_movk_i32 s0, 0x4800 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s0, s0 +; GFX950-NEXT: s_movk_i32 s0, 0x4c00 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half 8.0) %max1 = call half @llvm.minimum.f16(half %max0, half 16.0) ret half %max1 @@ -3620,20 +3777,30 @@ define <2 x half> @v_no_fminimum3_f16__multi_use(half %a, half %b, half %c) { ; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_no_fminimum3_f16__multi_use: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v3, v0, v1 -; GFX9-NEXT: 
v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: v_no_fminimum3_f16__multi_use: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_min_f16_e32 v1, v0, v2 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX942-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_no_fminimum3_f16__multi_use: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v0, v2, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half %c) %insert.0 = insertelement <2 x half> poison, half %max0, i32 0 @@ -3651,23 +3818,35 @@ define amdgpu_ps <2 x i32> @s_no_fminimum3_f16__multi_use(half inreg %a, half in ; GFX12-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX12-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_no_fminimum3_f16__multi_use: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_min_f16_e32 v1, s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_min_f16_e32 v1, s2, v0 -; GFX9-NEXT: v_cmp_o_f16_e32 
vcc, s2, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NEXT: ; return to shader part epilog +; GFX942-LABEL: s_no_fminimum3_f16__multi_use: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NEXT: v_min_f16_e32 v1, s0, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX942-NEXT: v_min_f16_e32 v1, s2, v0 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 +; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX942-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX942-NEXT: v_readfirstlane_b32 s0, v0 +; GFX942-NEXT: v_readfirstlane_b32 s1, v1 +; GFX942-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_no_fminimum3_f16__multi_use: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s1, s1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v0, s2, s2 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX950-NEXT: v_readfirstlane_b32 s0, v0 +; GFX950-NEXT: v_readfirstlane_b32 s1, v1 +; GFX950-NEXT: ; return to shader part epilog %max0 = call half @llvm.minimum.f16(half %a, half %b) %max1 = call half @llvm.minimum.f16(half %max0, half %c) %cast0 = bitcast half %max0 to i16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 4532571d5cf2a..e828a12442fb8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -42,11 +42,7 @@ define half @v_maximum_f16(half %src0, half %src1) { ; GFX950-LABEL: v_maximum_f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX950-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f16: @@ -96,11 +92,17 @@ define half @v_maximum_f16__nnan(half %src0, half %src1) { ; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f16__nnan: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f16__nnan: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f16__nnan: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f16__nnan: ; GFX10: ; %bb.0: @@ -162,11 +164,7 @@ define half @v_maximum_f16__nsz(half %src0, half %src1) { ; GFX950-LABEL: v_maximum_f16__nsz: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f16__nsz: @@ -216,11 +214,17 @@ define half @v_maximum_f16__nnan_nsz(half %src0, half %src1) { ; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f16__nnan_nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: 
v_maximum_f16__nnan_nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f16__nnan_nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f16__nnan_nsz: ; GFX10: ; %bb.0: @@ -286,11 +290,7 @@ define half @v_maximum_f16__nnan_src0(half %arg0, half %src1) { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_add_f16_e32 v0, 1.0, v0 -; GFX950-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f16__nnan_src0: @@ -367,11 +367,7 @@ define half @v_maximum_f16__nnan_src1(half %src0, half %arg1) { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GFX950-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f16__nnan_src1: @@ -458,12 +454,9 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) { ; GFX950-LABEL: s_maximum_f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, s1 -; GFX950-NEXT: v_max_f16_e32 v1, s0, v0 -; GFX950-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: 
v_pk_maximum3_f16 v0, v0, s1, s1 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use v0 @@ -2505,3 +2498,4 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) { } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GCN: {{.*}} +; GFX9: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index 0b9cb9682ea5f..9a2ef15737308 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -30,11 +30,7 @@ define half @v_minimum_f16(half %src0, half %src1) { ; GFX950-LABEL: v_minimum_f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f16: @@ -74,11 +70,17 @@ define half @v_minimum_f16__nnan(half %src0, half %src1) { ; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f16__nnan: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f16__nnan: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f16__nnan: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f16__nnan: ; GFX10: ; %bb.0: @@ -127,11 +129,7 @@ define half @v_minimum_f16__nsz(half %src0, half %src1) { ; 
GFX950-LABEL: v_minimum_f16__nsz: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f16__nsz: @@ -171,11 +169,17 @@ define half @v_minimum_f16__nnan_nsz(half %src0, half %src1) { ; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f16__nnan_nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f16__nnan_nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f16__nnan_nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f16__nnan_nsz: ; GFX10: ; %bb.0: @@ -227,11 +231,7 @@ define half @v_minimum_f16__nnan_src0(half %arg0, half %src1) { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_add_f16_e32 v0, 1.0, v0 -; GFX950-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f16__nnan_src0: @@ -294,11 +294,7 @@ define half @v_minimum_f16__nnan_src1(half %src0, half %arg1) { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GFX950-NEXT: v_min_f16_e32 v2, 
v0, v1 -; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f16__nnan_src1: @@ -368,12 +364,9 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) { ; GFX950-LABEL: s_minimum_f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, s1 -; GFX950-NEXT: v_min_f16_e32 v1, s0, v0 -; GFX950-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s1, s1 +; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use v0 @@ -1924,3 +1917,4 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) { } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GCN: {{.*}} +; GFX9: {{.*}} _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits