https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/74558
>From cdafeff37cd20e8cb8cdcf6ac8561455d5c9a30a Mon Sep 17 00:00:00 2001 From: Matt Arsenault <matthew.arsena...@amd.com> Date: Sat, 2 Dec 2023 20:49:51 +0700 Subject: [PATCH 1/2] DAG: Fix ABI lowering with FP promote in strictfp functions This was emitting non-strict casts in ABI contexts for illegal types. --- .../SelectionDAG/SelectionDAGBuilder.cpp | 59 +- .../CodeGen/AMDGPU/llvm.is.fpclass.f16.ll | 3 + llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll | 110 ---- llvm/test/CodeGen/AMDGPU/strict_fpext.ll | 280 ++++++++- llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll | 248 +++++++- .../AMDGPU/strictfp_f16_abi_promote.ll | 558 ++++++++++++++++++ 6 files changed, 1079 insertions(+), 179 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll create mode 100644 llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 78ebd2d33459a7..2f4f29f4e045ce 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -153,6 +153,7 @@ static const unsigned MaxParallelChains = 64; static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, MVT PartVT, EVT ValueVT, const Value *V, + SDValue InChain, std::optional<CallingConv::ID> CC); /// getCopyFromParts - Create a value that contains the specified legal parts @@ -163,6 +164,7 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, static SDValue getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, MVT PartVT, EVT ValueVT, const Value *V, + SDValue InChain, std::optional<CallingConv::ID> CC = std::nullopt, std::optional<ISD::NodeType> AssertOp = std::nullopt) { // Let the target assemble the parts if it wants to @@ -173,7 +175,7 @@ getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, if (ValueVT.isVector()) return getCopyFromPartsVector(DAG, DL, Parts, NumParts, PartVT, ValueVT, V, - CC); + InChain, CC); assert(NumParts > 0 && "No parts to assemble!"); SDValue Val = Parts[0]; @@ -194,10 +196,10 @@ getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), RoundBits/2); if (RoundParts > 2) { - Lo = getCopyFromParts(DAG, DL, Parts, RoundParts / 2, - PartVT, HalfVT, V); - Hi = getCopyFromParts(DAG, DL, Parts + RoundParts / 2, - RoundParts / 2, PartVT, HalfVT, V); + Lo = getCopyFromParts(DAG, DL, Parts, RoundParts / 2, PartVT, HalfVT, V, + InChain); + Hi = getCopyFromParts(DAG, DL, Parts + RoundParts / 2, RoundParts / 2, + PartVT, HalfVT, V, InChain); } else { Lo = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[0]); Hi = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[1]); @@ -213,7 +215,7 @@ getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned OddParts = NumParts - RoundParts; EVT OddVT = EVT::getIntegerVT(*DAG.getContext(), OddParts * PartBits); Hi = getCopyFromParts(DAG, DL, Parts + RoundParts, OddParts, PartVT, - OddVT, V, CC); + OddVT, V, InChain, CC); // Combine the round and odd parts. Lo = Val; @@ -243,7 +245,8 @@ getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, assert(ValueVT.isFloatingPoint() && PartVT.isInteger() && !PartVT.isVector() && "Unexpected split"); EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits()); - Val = getCopyFromParts(DAG, DL, Parts, NumParts, PartVT, IntVT, V, CC); + Val = getCopyFromParts(DAG, DL, Parts, NumParts, PartVT, IntVT, V, + InChain, CC); } } @@ -283,10 +286,20 @@ getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, if (PartEVT.isFloatingPoint() && ValueVT.isFloatingPoint()) { // FP_ROUND's are always exact here. - if (ValueVT.bitsLT(Val.getValueType())) - return DAG.getNode( - ISD::FP_ROUND, DL, ValueVT, Val, - DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout()))); + if (ValueVT.bitsLT(Val.getValueType())) { + + SDValue NoChange = + DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout())); + + if (DAG.getMachineFunction().getFunction().getAttributes().hasFnAttr( + llvm::Attribute::StrictFP)) { + return DAG.getNode(ISD::STRICT_FP_ROUND, DL, + DAG.getVTList(ValueVT, MVT::Other), InChain, Val, + NoChange); + } + + return DAG.getNode(ISD::FP_ROUND, DL, ValueVT, Val, NoChange); + } return DAG.getNode(ISD::FP_EXTEND, DL, ValueVT, Val); } @@ -324,6 +337,7 @@ static void diagnosePossiblyInvalidConstraint(LLVMContext &Ctx, const Value *V, static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, MVT PartVT, EVT ValueVT, const Value *V, + SDValue InChain, std::optional<CallingConv::ID> CallConv) { assert(ValueVT.isVector() && "Not a vector value"); assert(NumParts > 0 && "No parts to assemble!"); @@ -362,8 +376,8 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, // If the register was not expanded, truncate or copy the value, // as appropriate. for (unsigned i = 0; i != NumParts; ++i) - Ops[i] = getCopyFromParts(DAG, DL, &Parts[i], 1, - PartVT, IntermediateVT, V, CallConv); + Ops[i] = getCopyFromParts(DAG, DL, &Parts[i], 1, PartVT, IntermediateVT, + V, InChain, CallConv); } else if (NumParts > 0) { // If the intermediate type was expanded, build the intermediate // operands from the parts. @@ -371,8 +385,8 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, "Must expand into a divisible number of parts!"); unsigned Factor = NumParts / NumIntermediates; for (unsigned i = 0; i != NumIntermediates; ++i) - Ops[i] = getCopyFromParts(DAG, DL, &Parts[i * Factor], Factor, - PartVT, IntermediateVT, V, CallConv); + Ops[i] = getCopyFromParts(DAG, DL, &Parts[i * Factor], Factor, PartVT, + IntermediateVT, V, InChain, CallConv); } // Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the @@ -926,7 +940,7 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, } Values[Value] = getCopyFromParts(DAG, dl, Parts.begin(), NumRegs, - RegisterVT, ValueVT, V, CallConv); + RegisterVT, ValueVT, V, Chain, CallConv); Part += NumRegs; Parts.clear(); } @@ -10635,9 +10649,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { unsigned NumRegs = getNumRegistersForCallingConv(CLI.RetTy->getContext(), CLI.CallConv, VT); - ReturnValues.push_back(getCopyFromParts(CLI.DAG, CLI.DL, &InVals[CurReg], - NumRegs, RegisterVT, VT, nullptr, - CLI.CallConv, AssertOp)); + ReturnValues.push_back(getCopyFromParts( + CLI.DAG, CLI.DL, &InVals[CurReg], NumRegs, RegisterVT, VT, nullptr, + CLI.Chain, CLI.CallConv, AssertOp)); CurReg += NumRegs; } @@ -11116,8 +11130,9 @@ void SelectionDAGISel::LowerArguments(const Function &F) { MVT VT = ValueVTs[0].getSimpleVT(); MVT RegVT = TLI->getRegisterType(*CurDAG->getContext(), VT); std::optional<ISD::NodeType> AssertOp; - SDValue ArgValue = getCopyFromParts(DAG, dl, &InVals[0], 1, RegVT, VT, - nullptr, F.getCallingConv(), AssertOp); + SDValue ArgValue = + getCopyFromParts(DAG, dl, &InVals[0], 1, RegVT, VT, nullptr, NewRoot, + F.getCallingConv(), AssertOp); MachineFunction& MF = SDB->DAG.getMachineFunction(); MachineRegisterInfo& RegInfo = MF.getRegInfo(); @@ -11189,7 +11204,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) { AssertOp = ISD::AssertZext; ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], NumParts, - PartVT, VT, nullptr, + PartVT, VT, nullptr, NewRoot, F.getCallingConv(), AssertOp)); } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll index d74948a460c98b..20d175ecfd9096 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll @@ -1316,6 +1316,9 @@ define i1 @isnan_f16_strictfp(half %x) strictfp nounwind { ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 +; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7SELDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll b/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll deleted file mode 100644 index 8a3647a9b6e939..00000000000000 --- a/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll +++ /dev/null @@ -1,110 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GFX7 %s - -declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) #0 -declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) #0 -declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) #0 -declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float>, metadata, metadata) #0 -declare float @llvm.fabs.f32(float) - -define float @v_constrained_fpext_f16_to_f32(ptr addrspace(1) %ptr) #0 { -; GFX7-LABEL: v_constrained_fpext_f16_to_f32: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] - %val = load half, ptr addrspace(1) %ptr - %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %val, metadata !"fpexcept.strict") - ret float %result -} - -define <2 x float> @v_constrained_fpext_v2f16_to_v2f32(ptr addrspace(1) %ptr) #0 { -; GFX7-LABEL: v_constrained_fpext_v2f16_to_v2f32: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: s_setpc_b64 s[30:31] - %val = load <2 x half>, ptr addrspace(1) %ptr - %result = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %val, metadata !"fpexcept.strict") - ret <2 x float> %result -} - -define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict(float %arg, ptr addrspace(1) %ptr) #0 { -; GFX7-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] - %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") - ret void -} - -define void @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict(<2 x float> %arg, ptr addrspace(1) %ptr) #0 { -; GFX7-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: s_setpc_b64 s[30:31] - %result = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") - store <2 x half> %result, ptr addrspace(1) %ptr - ret void -} - -define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_fneg(float %arg, ptr addrspace(1) %ptr) #0 { -; GFX7-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_fneg: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] - %neg.arg = fneg float %arg - %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %neg.arg, metadata !"round.tonearest", metadata !"fpexcept.strict") - ret void -} - -define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_fabs(float %arg, ptr addrspace(1) %ptr) #0 { -; GFX7-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_fabs: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f16_f32_e64 v0, |v0| -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] - %abs.arg = call float @llvm.fabs.f32(float %arg) - %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %abs.arg, metadata !"round.tonearest", metadata !"fpexcept.strict") - ret void -} - -attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll index 22bebb7ad26f53..fe59a8491c91ab 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll @@ -1,22 +1,47 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; FIXME: Missing operand promote for f16 -; XUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX8 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX9 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11 %s define float @v_constrained_fpext_f16_to_f32_fpexcept_strict(half %arg) #0 { -; GCN-LABEL: v_constrained_fpext_f16_to_f32_fpexcept_strict: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_constrained_fpext_f16_to_f32_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v_constrained_fpext_f16_to_f32_fpexcept_strict: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: v_constrained_fpext_f16_to_f32_fpexcept_strict: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %arg, metadata !"fpexcept.strict") ret float %result } define <2 x float> @v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict(<2 x half> %arg) #0 { +; SI-LABEL: v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_setpc_b64 s[30:31] +; ; GFX89-LABEL: v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -45,6 +70,20 @@ define <2 x float> @v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict(<2 x half } define <3 x float> @v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict(<3 x half> %arg) #0 { +; SI-LABEL: v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; ; GFX89-LABEL: v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -101,6 +140,16 @@ define <2 x double> @v_constrained_fpext_v2f32_to_v2f64_fpexcept_strict(<2 x flo } define <3 x double> @v_constrained_fpext_v3f32_to_v3f64_fpexcept_strict(<3 x float> %arg) #0 { +; SI-LABEL: v_constrained_fpext_v3f32_to_v3f64_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NEXT: s_setpc_b64 s[30:31] +; ; GFX89-LABEL: v_constrained_fpext_v3f32_to_v3f64_fpexcept_strict: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -125,17 +174,46 @@ define <3 x double> @v_constrained_fpext_v3f32_to_v3f64_fpexcept_strict(<3 x flo } define double @v_constrained_fpext_f16_to_f64_fpexcept_strict(half %arg) #0 { -; GCN-LABEL: v_constrained_fpext_f16_to_f64_fpexcept_strict: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_constrained_fpext_f16_to_f64_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v_constrained_fpext_f16_to_f64_fpexcept_strict: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX89-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: v_constrained_fpext_f16_to_f64_fpexcept_strict: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1011-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] %result = call double @llvm.experimental.constrained.fpext.f64.f16(half %arg, metadata !"fpexcept.strict") ret double %result } define <2 x double> @v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict(<2 x half> %arg) #0 { +; SI-LABEL: v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; ; GFX89-LABEL: v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -168,6 +246,23 @@ define <2 x double> @v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict(<2 x hal } define <3 x double> @v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict(<3 x half> %arg) #0 { +; SI-LABEL: v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NEXT: s_setpc_b64 s[30:31] +; ; GFX89-LABEL: v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -206,23 +301,54 @@ define <3 x double> @v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict(<3 x hal } define float @v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict(half %arg) #0 { -; GCN-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX89-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1011-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %arg, metadata !"fpexcept.strict") %neg.result = fneg float %result ret float %neg.result } define float @v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict(half %arg) #0 { -; GCN-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] %neg.arg = fneg half %arg %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %neg.arg, metadata !"fpexcept.strict") ret float %result @@ -251,6 +377,111 @@ define double @v_constrained_fneg_fpext_f32_to_f64_fpexcept_strict(float %arg) # ret double %neg.result } +define float @v_constrained_fpext_f16_to_f32_noabi(ptr addrspace(1) %ptr) #0 { +; SI-LABEL: v_constrained_fpext_f16_to_f32_noabi: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_constrained_fpext_f16_to_f32_noabi: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_constrained_fpext_f16_to_f32_noabi: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constrained_fpext_f16_to_f32_noabi: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_constrained_fpext_f16_to_f32_noabi: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val = load half, ptr addrspace(1) %ptr + %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %val, metadata !"fpexcept.strict") + ret float %result +} + +define <2 x float> @v_constrained_fpext_v2f16_to_v2f32_noabi(ptr addrspace(1) %ptr) #0 { +; SI-LABEL: v_constrained_fpext_v2f16_to_v2f32_noabi: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_constrained_fpext_v2f16_to_v2f32_noabi: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_constrained_fpext_v2f16_to_v2f32_noabi: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constrained_fpext_v2f16_to_v2f32_noabi: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_constrained_fpext_v2f16_to_v2f32_noabi: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val = load <2 x half>, ptr addrspace(1) %ptr + %result = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %val, metadata !"fpexcept.strict") + ret <2 x float> %result +} + declare double @llvm.experimental.constrained.fpext.f64.f32(float, metadata) #1 declare <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f32(<2 x float>, metadata) #1 declare <3 x double> @llvm.experimental.constrained.fpext.v3f64.v3f32(<3 x float>, metadata) #1 @@ -265,6 +496,3 @@ declare <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half>, attributes #0 = { strictfp } attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX8: {{.*}} -; GFX9: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll b/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll index ec2bc43ca36787..965040d0d879c8 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll @@ -1,21 +1,46 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; XUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX8 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX9 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11 %s define half @v_constrained_fptrunc_f32_to_f16_fpexcept_strict(float %arg) #0 { -; GCN-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val } define <2 x half> @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict(<2 x float> %arg) #0 { +; SI-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_setpc_b64 s[30:31] +; ; GFX8-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -45,6 +70,20 @@ define <2 x half> @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict(<2 x flo } define <3 x half> @v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict(<3 x float> %arg) #0 { +; SI-LABEL: v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; ; GFX8-LABEL: v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -126,23 +165,53 @@ define <3 x float> @v_constrained_fptrunc_v3f64_to_v3f32_fpexcept_strict(<3 x do ; } define half @v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict(float %arg) #0 { -; GCN-LABEL: v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX89-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX1011-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") %neg.val = fneg half %val ret half %neg.val } define half @v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict(float %arg) #0 { -; GCN-LABEL: v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] %neg.arg = fneg float %arg %val = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %neg.arg, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val @@ -171,6 +240,145 @@ define float @v_constrained_fptrunc_fneg_f64_to_f32_fpexcept_strict(double %arg) ret float %val } +define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi(float %arg, ptr addrspace(1) %ptr) #0 { +; SI-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret void +} + +define void @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi(<2 x float> %arg, ptr addrspace(1) %ptr) #0 { +; SI-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: flat_store_dword v[2:3], v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: global_store_b32 v[2:3], v0, off +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + store <2 x half> %result, ptr addrspace(1) %ptr + ret void +} + +define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fneg(float %arg, ptr addrspace(1) %ptr) #0 { +; SI-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fneg: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fneg: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fneg: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] + %neg.arg = fneg float %arg + %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %neg.arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret void +} + +define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fabs(float %arg, ptr addrspace(1) %ptr) #0 { +; SI-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fabs: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e64 v0, |v0| +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fabs: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_cvt_f16_f32_e64 v0, |v0| +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fabs: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cvt_f16_f32_e64 v0, |v0| +; GFX1011-NEXT: s_setpc_b64 s[30:31] + %abs.arg = call float @llvm.fabs.f32(float %arg) + %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %abs.arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret void +} + declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) #1 declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float>, metadata, metadata) #1 declare <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float>, metadata, metadata) #1 @@ -183,9 +391,7 @@ declare half @llvm.experimental.constrained.fptrunc.f16.f64(double, metadata, me declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f64(<2 x double>, metadata, metadata) #1 declare <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f64(<3 x double>, metadata, metadata) #1 +declare float @llvm.fabs.f32(float) #1 + attributes #0 = { strictfp } attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX10: {{.*}} -; GFX11: {{.*}} -; GFX89: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll new file mode 100644 index 00000000000000..57e4cec4eccb11 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll @@ -0,0 +1,558 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s + +declare void @f16_user(half) +declare half @f16_result() + +declare void @v2f16_user(<2 x half>) +declare <2 x half> @v2f16_result() + +declare void @v4f16_user(<4 x half>) +declare <4 x half> @v4f16_result() + +declare void @v8f16_user(<8 x half>) +declare <8 x half> @v8f16_result() + +define void @f16_arg(half %arg, ptr %ptr) #0 { +; GFX7-LABEL: f16_arg: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: flat_store_dword v[1:2], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %fpext = call float @llvm.experimental.constrained.fpext.f32.f16(half %arg, metadata !"fpexcept.strict") + store float %fpext, ptr %ptr + ret void +} + +define void @v2f16_arg(<2 x half> %arg, ptr %ptr) #0 { +; GFX7-LABEL: v2f16_arg: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v5 +; GFX7-NEXT: flat_store_dword v[2:3], v4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %fpext = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %arg, metadata !"fpexcept.strict") + store <2 x float> %fpext, ptr %ptr + ret void +} + +define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 { +; GFX7-LABEL: v3f16_arg: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v3 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v3 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v6 +; GFX7-NEXT: flat_store_dword v[3:4], v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %fpext = call <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half> %arg, metadata !"fpexcept.strict") + store <3 x float> %fpext, ptr %ptr + ret void +} + +define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { +; GFX7-LABEL: v4f16_arg: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v4 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v4 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v7 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v3 +; GFX7-NEXT: flat_store_dword v[4:5], v6 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %fpext = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %arg, metadata !"fpexcept.strict") + store <4 x float> %fpext, ptr %ptr + ret void +} + +define half @f16_return(float %arg) #0 { +; GFX7-LABEL: f16_return: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret half %fptrunc +} + +define <2 x half> @v2f16_return(<2 x float> %arg) #0 { +; GFX7-LABEL: v2f16_return: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret <2 x half> %fptrunc +} + +define <3 x half> @v3f16_return(<3 x float> %arg) #0 { +; GFX7-LABEL: v3f16_return: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret <3 x half> %fptrunc +} + +define <4 x half> @v4f16_return(<4 x float> %arg) #0 { +; GFX7-LABEL: v4f16_return: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret <4 x half> %fptrunc +} + +define void @outgoing_f16_arg(ptr %ptr) #0 { +; GFX7-LABEL: outgoing_f16_arg: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s16, s33 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[18:19] +; GFX7-NEXT: flat_load_ushort v0, v[0:1] +; GFX7-NEXT: v_writelane_b32 v40, s16, 2 +; GFX7-NEXT: v_writelane_b32 v40, s30, 0 +; GFX7-NEXT: s_mov_b32 s17, f16_user@abs32@hi +; GFX7-NEXT: s_mov_b32 s16, f16_user@abs32@lo +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: v_writelane_b32 v40, s31, 1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: v_readlane_b32 s31, v40, 1 +; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: v_readlane_b32 s4, v40, 2 +; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %val = load half, ptr %ptr + call void @f16_user(half %val) + ret void +} + +define void @outgoing_v2f16_arg(ptr %ptr) #0 { +; GFX7-LABEL: outgoing_v2f16_arg: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s16, s33 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[18:19] +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: v_writelane_b32 v40, s16, 2 +; GFX7-NEXT: v_writelane_b32 v40, s30, 0 +; GFX7-NEXT: s_mov_b32 s17, v2f16_user@abs32@hi +; GFX7-NEXT: s_mov_b32 s16, v2f16_user@abs32@lo +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: v_writelane_b32 v40, s31, 1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: v_readlane_b32 s31, v40, 1 +; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: v_readlane_b32 s4, v40, 2 +; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %val = load <2 x half>, ptr %ptr + call void @v2f16_user(<2 x half> %val) + ret void +} + +define void @outgoing_f16_return(ptr %ptr) #0 { +; GFX7-LABEL: outgoing_f16_return: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s16, s33 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[18:19] +; GFX7-NEXT: v_writelane_b32 v40, s16, 2 +; GFX7-NEXT: v_writelane_b32 v40, s30, 0 +; GFX7-NEXT: s_mov_b32 s17, f16_result@abs32@hi +; GFX7-NEXT: s_mov_b32 s16, f16_result@abs32@lo +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: v_writelane_b32 v40, s31, 1 +; GFX7-NEXT: v_mov_b32_e32 v42, v1 +; GFX7-NEXT: v_mov_b32_e32 v41, v0 +; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_readlane_b32 s31, v40, 1 +; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: v_readlane_b32 s4, v40, 2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: flat_store_short v[41:42], v0 +; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %val = call half @f16_result() + store half %val, ptr %ptr + ret void +} + +define void @outgoing_v2f16_return(ptr %ptr) #0 { +; GFX7-LABEL: outgoing_v2f16_return: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s16, s33 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[18:19] +; GFX7-NEXT: v_writelane_b32 v40, s16, 2 +; GFX7-NEXT: v_writelane_b32 v40, s30, 0 +; GFX7-NEXT: s_mov_b32 s17, v2f16_result@abs32@hi +; GFX7-NEXT: s_mov_b32 s16, v2f16_result@abs32@lo +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: v_writelane_b32 v40, s31, 1 +; GFX7-NEXT: v_mov_b32_e32 v42, v1 +; GFX7-NEXT: v_mov_b32_e32 v41, v0 +; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_readlane_b32 s31, v40, 1 +; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_readlane_b32 s4, v40, 2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: flat_store_dword v[41:42], v0 +; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %val = call <2 x half> @v2f16_result() + store <2 x half> %val, ptr %ptr + ret void +} + +define void @outgoing_v4f16_return(ptr %ptr) #0 { +; GFX7-LABEL: outgoing_v4f16_return: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s16, s33 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[18:19] +; GFX7-NEXT: v_writelane_b32 v40, s16, 2 +; GFX7-NEXT: v_writelane_b32 v40, s30, 0 +; GFX7-NEXT: s_mov_b32 s17, v4f16_result@abs32@hi +; GFX7-NEXT: s_mov_b32 s16, v4f16_result@abs32@lo +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: v_writelane_b32 v40, s31, 1 +; GFX7-NEXT: v_mov_b32_e32 v42, v1 +; GFX7-NEXT: v_mov_b32_e32 v41, v0 +; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v41 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v42, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: flat_store_dword v[41:42], v4 +; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX7-NEXT: v_readlane_b32 s31, v40, 1 +; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: v_readlane_b32 s4, v40, 2 +; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %val = call <4 x half> @v4f16_result() + store <4 x half> %val, ptr %ptr + ret void +} + +define void @outgoing_v8f16_return(ptr %ptr) #0 { +; GFX7-LABEL: outgoing_v8f16_return: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s16, s33 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[18:19] +; GFX7-NEXT: v_writelane_b32 v40, s16, 2 +; GFX7-NEXT: v_writelane_b32 v40, s30, 0 +; GFX7-NEXT: s_mov_b32 s17, v8f16_result@abs32@hi +; GFX7-NEXT: s_mov_b32 s16, v8f16_result@abs32@lo +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: v_writelane_b32 v40, s31, 1 +; GFX7-NEXT: v_mov_b32_e32 v42, v1 +; GFX7-NEXT: v_mov_b32_e32 v41, v0 +; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v8, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v6 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v5, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v41 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v42, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v3 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v41 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v42, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v5 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v41 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v42, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: flat_store_dword v[41:42], v8 +; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX7-NEXT: v_readlane_b32 s31, v40, 1 +; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: v_readlane_b32 s4, v40, 2 +; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %val = call <8 x half> @v8f16_result() + store <8 x half> %val, ptr %ptr + ret void +} + +define half @call_split_type_used_outside_block_v8f16() #0 { +; GFX7-LABEL: call_split_type_used_outside_block_v8f16: +; GFX7: ; %bb.0: ; %bb0 +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s16, s33 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[18:19] +; GFX7-NEXT: v_writelane_b32 v40, s16, 2 +; GFX7-NEXT: v_writelane_b32 v40, s30, 0 +; GFX7-NEXT: s_mov_b32 s17, v8f16_result@abs32@hi +; GFX7-NEXT: s_mov_b32 s16, v8f16_result@abs32@lo +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: v_writelane_b32 v40, s31, 1 +; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_readlane_b32 s31, v40, 1 +; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: v_readlane_b32 s4, v40, 2 +; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +bb0: + %split.ret.type = call <8 x half> @v8f16_result() + br label %bb1 + +bb1: + %extract = extractelement <8 x half> %split.ret.type, i32 0 + ret half %extract +} + +declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) #0 +declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) #0 +declare <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half>, metadata) #0 +declare <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half>, metadata) #0 + +declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) #0 +declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float>, metadata, metadata) #0 +declare <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float>, metadata, metadata) #0 +declare <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float>, metadata, metadata) #0 + +attributes #0 = { strictfp } >From 9757953369fd2af02e60a8f35b1b79383ef474e6 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <matthew.arsena...@amd.com> Date: Tue, 5 Dec 2023 09:57:59 +0700 Subject: [PATCH 2/2] DAG: Fix chain mismanagement in SoftenFloatRes_FP_EXTEND This would result in nodes not getting appropriately re-legalized in the strictfp case. --- .../SelectionDAG/LegalizeFloatTypes.cpp | 9 +- .../AArch64/strictfp_f16_abi_promote.ll | 344 ++++++++++++++++++ 2 files changed, 351 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 6e0e1e23419bec..2f916e27a587c3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -522,8 +522,11 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { Op = GetPromotedFloat(Op); // If the promotion did the FP_EXTEND to the destination type for us, // there's nothing left to do here. - if (Op.getValueType() == N->getValueType(0)) + if (Op.getValueType() == N->getValueType(0)) { + if (IsStrict) + ReplaceValueWith(SDValue(N, 1), Chain); return BitConvertToInteger(Op); + } } // There's only a libcall for f16 -> f32 and shifting is only valid for bf16 @@ -541,8 +544,10 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { } } - if (Op.getValueType() == MVT::bf16) + if (Op.getValueType() == MVT::bf16) { + // FIXME: Need ReplaceValueWith on chain in strict case return SoftenFloatRes_BF16_TO_FP(N); + } RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0)); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!"); diff --git a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll new file mode 100644 index 00000000000000..37186cf22ccc71 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll @@ -0,0 +1,344 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 < %s | FileCheck -check-prefix=NOFP16 %s + +declare void @f16_user(half) +declare half @f16_result() + +declare void @v2f16_user(<2 x half>) +declare <2 x half> @v2f16_result() + +declare void @v4f16_user(<4 x half>) +declare <4 x half> @v4f16_result() + +declare void @v8f16_user(<8 x half>) +declare <8 x half> @v8f16_result() + +define void @f16_arg(half %arg, ptr %ptr) #0 { +; NOFP16-LABEL: f16_arg: +; NOFP16: // %bb.0: +; NOFP16-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 16 +; NOFP16-NEXT: .cfi_offset w19, -8 +; NOFP16-NEXT: .cfi_offset w30, -16 +; NOFP16-NEXT: and w0, w0, #0xffff +; NOFP16-NEXT: mov x19, x1 +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: str w0, [x19] +; NOFP16-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; NOFP16-NEXT: ret + %fpext = call float @llvm.experimental.constrained.fpext.f32.f16(half %arg, metadata !"fpexcept.strict") + store float %fpext, ptr %ptr + ret void +} + +define void @v2f16_arg(<2 x half> %arg, ptr %ptr) #0 { +; NOFP16-LABEL: v2f16_arg: +; NOFP16: // %bb.0: +; NOFP16-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; NOFP16-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 32 +; NOFP16-NEXT: .cfi_offset w19, -8 +; NOFP16-NEXT: .cfi_offset w20, -16 +; NOFP16-NEXT: .cfi_offset w21, -24 +; NOFP16-NEXT: .cfi_offset w30, -32 +; NOFP16-NEXT: and w0, w0, #0xffff +; NOFP16-NEXT: mov x19, x2 +; NOFP16-NEXT: mov w20, w1 +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w21, w0 +; NOFP16-NEXT: and w0, w20, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: stp w21, w0, [x19] +; NOFP16-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; NOFP16-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; NOFP16-NEXT: ret + %fpext = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %arg, metadata !"fpexcept.strict") + store <2 x float> %fpext, ptr %ptr + ret void +} + +define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 { +; NOFP16-LABEL: v3f16_arg: +; NOFP16: // %bb.0: +; NOFP16-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; NOFP16-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; NOFP16-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 48 +; NOFP16-NEXT: .cfi_offset w19, -8 +; NOFP16-NEXT: .cfi_offset w20, -16 +; NOFP16-NEXT: .cfi_offset w21, -24 +; NOFP16-NEXT: .cfi_offset w22, -32 +; NOFP16-NEXT: .cfi_offset w30, -48 +; NOFP16-NEXT: mov w21, w0 +; NOFP16-NEXT: and w0, w2, #0xffff +; NOFP16-NEXT: mov x19, x3 +; NOFP16-NEXT: mov w20, w1 +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w22, w0 +; NOFP16-NEXT: and w0, w21, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w21, w0 +; NOFP16-NEXT: and w0, w20, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w8, w21 +; NOFP16-NEXT: // kill: def $w0 killed $w0 def $x0 +; NOFP16-NEXT: str w22, [x19, #8] +; NOFP16-NEXT: orr x8, x8, x0, lsl #32 +; NOFP16-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; NOFP16-NEXT: str x8, [x19] +; NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; NOFP16-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; NOFP16-NEXT: ret + %fpext = call <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half> %arg, metadata !"fpexcept.strict") + store <3 x float> %fpext, ptr %ptr + ret void +} + +define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { +; NOFP16-LABEL: v4f16_arg: +; NOFP16: // %bb.0: +; NOFP16-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; NOFP16-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; NOFP16-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 48 +; NOFP16-NEXT: .cfi_offset w19, -8 +; NOFP16-NEXT: .cfi_offset w20, -16 +; NOFP16-NEXT: .cfi_offset w21, -24 +; NOFP16-NEXT: .cfi_offset w22, -32 +; NOFP16-NEXT: .cfi_offset w23, -40 +; NOFP16-NEXT: .cfi_offset w30, -48 +; NOFP16-NEXT: and w0, w0, #0xffff +; NOFP16-NEXT: mov x19, x4 +; NOFP16-NEXT: mov w20, w3 +; NOFP16-NEXT: mov w21, w2 +; NOFP16-NEXT: mov w22, w1 +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w23, w0 +; NOFP16-NEXT: and w0, w22, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w22, w0 +; NOFP16-NEXT: and w0, w21, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w21, w0 +; NOFP16-NEXT: and w0, w20, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: stp w21, w0, [x19, #8] +; NOFP16-NEXT: stp w23, w22, [x19] +; NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; NOFP16-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; NOFP16-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; NOFP16-NEXT: ret + %fpext = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %arg, metadata !"fpexcept.strict") + store <4 x float> %fpext, ptr %ptr + ret void +} + +; FIXME: +; define half @f16_return(float %arg) #0 { +; %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") +; ret half %fptrunc +; } + +; define <2 x half> @v2f16_return(<2 x float> %arg) #0 { +; %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") +; ret <2 x half> %fptrunc +; } + +; define <3 x half> @v3f16_return(<3 x float> %arg) #0 { +; %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") +; ret <3 x half> %fptrunc +; } + +; define <4 x half> @v4f16_return(<4 x float> %arg) #0 { +; %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") +; ret <4 x half> %fptrunc +; } + +; FIXME: +; define void @outgoing_f16_arg(ptr %ptr) #0 { +; %val = load half, ptr %ptr +; call void @f16_user(half %val) +; ret void +; } + +; define void @outgoing_v2f16_arg(ptr %ptr) #0 { +; %val = load <2 x half>, ptr %ptr +; call void @v2f16_user(<2 x half> %val) +; ret void +; } + +; define void @outgoing_f16_return(ptr %ptr) #0 { +; %val = call half @f16_result() +; store half %val, ptr %ptr +; ret void +; } + +; define void @outgoing_v2f16_return(ptr %ptr) #0 { +; %val = call <2 x half> @v2f16_result() +; store <2 x half> %val, ptr %ptr +; ret void +; } + +define void @outgoing_v4f16_return(ptr %ptr) #0 { +; NOFP16-LABEL: outgoing_v4f16_return: +; NOFP16: // %bb.0: +; NOFP16-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; NOFP16-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; NOFP16-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 48 +; NOFP16-NEXT: .cfi_offset w19, -8 +; NOFP16-NEXT: .cfi_offset w20, -16 +; NOFP16-NEXT: .cfi_offset w21, -24 +; NOFP16-NEXT: .cfi_offset w22, -32 +; NOFP16-NEXT: .cfi_offset w23, -40 +; NOFP16-NEXT: .cfi_offset w30, -48 +; NOFP16-NEXT: mov x19, x0 +; NOFP16-NEXT: bl v4f16_result +; NOFP16-NEXT: and w0, w0, #0xffff +; NOFP16-NEXT: mov w20, w1 +; NOFP16-NEXT: mov w21, w2 +; NOFP16-NEXT: mov w22, w3 +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w23, w0 +; NOFP16-NEXT: and w0, w20, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w20, w0 +; NOFP16-NEXT: and w0, w21, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w21, w0 +; NOFP16-NEXT: and w0, w22, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w0, [x19, #6] +; NOFP16-NEXT: mov w0, w21 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w0, [x19, #4] +; NOFP16-NEXT: mov w0, w20 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w0, [x19, #2] +; NOFP16-NEXT: mov w0, w23 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w0, [x19] +; NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; NOFP16-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; NOFP16-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; NOFP16-NEXT: ret + %val = call <4 x half> @v4f16_result() + store <4 x half> %val, ptr %ptr + ret void +} + +define void @outgoing_v8f16_return(ptr %ptr) #0 { +; NOFP16-LABEL: outgoing_v8f16_return: +; NOFP16: // %bb.0: +; NOFP16-NEXT: stp x30, x27, [sp, #-80]! // 16-byte Folded Spill +; NOFP16-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill +; NOFP16-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill +; NOFP16-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill +; NOFP16-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 80 +; NOFP16-NEXT: .cfi_offset w19, -8 +; NOFP16-NEXT: .cfi_offset w20, -16 +; NOFP16-NEXT: .cfi_offset w21, -24 +; NOFP16-NEXT: .cfi_offset w22, -32 +; NOFP16-NEXT: .cfi_offset w23, -40 +; NOFP16-NEXT: .cfi_offset w24, -48 +; NOFP16-NEXT: .cfi_offset w25, -56 +; NOFP16-NEXT: .cfi_offset w26, -64 +; NOFP16-NEXT: .cfi_offset w27, -72 +; NOFP16-NEXT: .cfi_offset w30, -80 +; NOFP16-NEXT: mov x19, x0 +; NOFP16-NEXT: bl v8f16_result +; NOFP16-NEXT: and w0, w0, #0xffff +; NOFP16-NEXT: mov w21, w1 +; NOFP16-NEXT: mov w22, w2 +; NOFP16-NEXT: mov w23, w3 +; NOFP16-NEXT: mov w24, w4 +; NOFP16-NEXT: mov w25, w5 +; NOFP16-NEXT: mov w26, w6 +; NOFP16-NEXT: mov w27, w7 +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w20, w0 +; NOFP16-NEXT: and w0, w21, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w21, w0 +; NOFP16-NEXT: and w0, w22, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w22, w0 +; NOFP16-NEXT: and w0, w23, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w23, w0 +; NOFP16-NEXT: and w0, w24, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w24, w0 +; NOFP16-NEXT: and w0, w25, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w25, w0 +; NOFP16-NEXT: and w0, w26, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w26, w0 +; NOFP16-NEXT: and w0, w27, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w0, [x19, #14] +; NOFP16-NEXT: mov w0, w26 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w0, [x19, #12] +; NOFP16-NEXT: mov w0, w25 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w0, [x19, #10] +; NOFP16-NEXT: mov w0, w24 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w0, [x19, #8] +; NOFP16-NEXT: mov w0, w23 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w0, [x19, #6] +; NOFP16-NEXT: mov w0, w22 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w0, [x19, #4] +; NOFP16-NEXT: mov w0, w21 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w0, [x19, #2] +; NOFP16-NEXT: mov w0, w20 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w0, [x19] +; NOFP16-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; NOFP16-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload +; NOFP16-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload +; NOFP16-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload +; NOFP16-NEXT: ldp x30, x27, [sp], #80 // 16-byte Folded Reload +; NOFP16-NEXT: ret + %val = call <8 x half> @v8f16_result() + store <8 x half> %val, ptr %ptr + ret void +} + +define half @call_split_type_used_outside_block_v8f16() #0 { +; NOFP16-LABEL: call_split_type_used_outside_block_v8f16: +; NOFP16: // %bb.0: // %bb0 +; NOFP16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 16 +; NOFP16-NEXT: .cfi_offset w30, -16 +; NOFP16-NEXT: bl v8f16_result +; NOFP16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; NOFP16-NEXT: ret +bb0: + %split.ret.type = call <8 x half> @v8f16_result() + br label %bb1 + +bb1: + %extract = extractelement <8 x half> %split.ret.type, i32 0 + ret half %extract +} + +declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) #0 +declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) #0 +declare <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half>, metadata) #0 +declare <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half>, metadata) #0 + +declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) #0 +declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float>, metadata, metadata) #0 +declare <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float>, metadata, metadata) #0 +declare <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float>, metadata, metadata) #0 + +attributes #0 = { strictfp } _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits