Author: Matt Arsenault Date: 2021-01-06T12:32:01-05:00 New Revision: ab3a3f543b18d36cec98faa9ca2a68cc9a6ecc65
URL: https://github.com/llvm/llvm-project/commit/ab3a3f543b18d36cec98faa9ca2a68cc9a6ecc65 DIFF: https://github.com/llvm/llvm-project/commit/ab3a3f543b18d36cec98faa9ca2a68cc9a6ecc65.diff LOG: AMDGPU/GlobalISel: Update fdiv lowering for denormal/ulp interaction Change the GlobalISel fast fdiv handling to match the changes in 2531535984ad989ce88aeee23cb92a827da6686e and 884acbb9e167d5668e43581630239d688edec8ad Added: Modified: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir Removed: ################################################################################ diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 8c733a2afa03..a8e6f27e032b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -3053,22 +3053,14 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, Register Res = MI.getOperand(0).getReg(); Register LHS = MI.getOperand(1).getReg(); Register RHS = MI.getOperand(2).getReg(); - uint16_t Flags = MI.getFlags(); - LLT ResTy = MRI.getType(Res); - LLT S32 = LLT::scalar(32); - LLT S64 = LLT::scalar(64); const MachineFunction &MF = B.getMF(); - bool Unsafe = - MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); + bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || + MI.getFlag(MachineInstr::FmAfn); - if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) - return false; - - if (!Unsafe && ResTy == S32 && - MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) + if (!AllowInaccurateRcp) return false; if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { @@ -3095,17 +3087,13 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, } // x / y -> x * (1.0 / y) - if (Unsafe) { - auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) - .addUse(RHS) - .setMIFlags(Flags); - B.buildFMul(Res, LHS, RCP, Flags); - - MI.eraseFromParent(); - return true; - } + auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) + .addUse(RHS) + .setMIFlags(Flags); + B.buildFMul(Res, LHS, RCP, Flags); - return false; + MI.eraseFromParent(); + return true; } bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll index 7775789bd0d2..c7b9b4f60bc6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -66,7 +66,28 @@ define half @v_fdiv_f16(half %a, half %b) { } define half @v_fdiv_f16_afn(half %a, half %b) { -; GFX6-IEEE-LABEL: v_fdiv_f16_afn: +; GFX6-LABEL: v_fdiv_f16_afn: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v_fdiv_f16_afn: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_rcp_f16_e32 v1, v1 +; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX89-NEXT: s_setpc_b64 s[30:31] + %fdiv = fdiv afn half %a, %b + ret half %fdiv +} + +define half @v_fdiv_f16_ulp25(half %a, half %b) { +; GFX6-IEEE-LABEL: v_fdiv_f16_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -85,7 +106,7 @@ define half @v_fdiv_f16_afn(half %a, half %b) { ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-FLUSH-LABEL: v_fdiv_f16_afn: +; GFX6-FLUSH-LABEL: v_fdiv_f16_ulp25: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -107,25 +128,29 @@ define half @v_fdiv_f16_afn(half %a, half %b) { ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX89-LABEL: v_fdiv_f16_afn: +; GFX89-LABEL: v_fdiv_f16_ulp25: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_rcp_f16_e32 v1, v1 -; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX89-NEXT: v_rcp_f32_e32 v2, v2 +; GFX89-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] - %fdiv = fdiv afn half %a, %b + %fdiv = fdiv half %a, %b, !fpmath !0 ret half %fdiv } -define half @v_fdiv_f16_ulp25(half %a, half %b) { -; GFX6-IEEE-LABEL: v_fdiv_f16_ulp25: +define half @v_rcp_f16(half %x) { +; GFX6-IEEE-LABEL: v_rcp_f16: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 ; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 @@ -133,18 +158,18 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) { ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-FLUSH-LABEL: v_fdiv_f16_ulp25: +; GFX6-FLUSH-LABEL: v_rcp_f16: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 ; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 @@ -154,27 +179,27 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) { ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX89-LABEL: v_fdiv_f16_ulp25: +; GFX89-LABEL: v_rcp_f16: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX89-NEXT: v_rcp_f32_e32 v2, v2 -; GFX89-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX89-NEXT: v_cvt_f32_f16_e32 v2, 1.0 +; GFX89-NEXT: v_rcp_f32_e32 v1, v1 +; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 ; GFX89-NEXT: s_setpc_b64 s[30:31] - %fdiv = fdiv half %a, %b, !fpmath !0 + %fdiv = fdiv half 1.0, %x ret half %fdiv } -define half @v_rcp_f16(half %x) { -; GFX6-IEEE-LABEL: v_rcp_f16: +define half @v_rcp_f16_arcp(half %x) { +; GFX6-IEEE-LABEL: v_rcp_f16_arcp: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 @@ -193,7 +218,7 @@ define half @v_rcp_f16(half %x) { ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-FLUSH-LABEL: v_rcp_f16: +; GFX6-FLUSH-LABEL: v_rcp_f16_arcp: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 @@ -215,30 +240,15 @@ define half @v_rcp_f16(half %x) { ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX89-LABEL: v_rcp_f16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_rcp_f16_e32 v0, v0 -; GFX89-NEXT: s_setpc_b64 s[30:31] - %fdiv = fdiv half 1.0, %x - ret half %fdiv -} - -define half @v_rcp_f16_arcp(half %x) { -; GFX6-LABEL: v_rcp_f16_arcp: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0 -; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_setpc_b64 s[30:31] -; ; GFX89-LABEL: v_rcp_f16_arcp: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_rcp_f16_e32 v0, v0 +; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX89-NEXT: v_cvt_f32_f16_e32 v2, 1.0 +; GFX89-NEXT: v_rcp_f32_e32 v1, v1 +; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 ; GFX89-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp half 1.0, %x ret half %fdiv @@ -316,7 +326,28 @@ define half @v_rcp_f16_ulp25(half %x) { } define half @v_fdiv_f16_afn_ulp25(half %a, half %b) { -; GFX6-IEEE-LABEL: v_fdiv_f16_afn_ulp25: +; GFX6-LABEL: v_fdiv_f16_afn_ulp25: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v_fdiv_f16_afn_ulp25: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_rcp_f16_e32 v1, v1 +; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX89-NEXT: s_setpc_b64 s[30:31] + %fdiv = fdiv afn half %a, %b, !fpmath !0 + ret half %fdiv +} + +define half @v_fdiv_f16_arcp_ulp25(half %a, half %b) { +; GFX6-IEEE-LABEL: v_fdiv_f16_arcp_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -335,7 +366,7 @@ define half @v_fdiv_f16_afn_ulp25(half %a, half %b) { ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-FLUSH-LABEL: v_fdiv_f16_afn_ulp25: +; GFX6-FLUSH-LABEL: v_fdiv_f16_arcp_ulp25: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -357,32 +388,15 @@ define half @v_fdiv_f16_afn_ulp25(half %a, half %b) { ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX89-LABEL: v_fdiv_f16_afn_ulp25: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_rcp_f16_e32 v1, v1 -; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX89-NEXT: s_setpc_b64 s[30:31] - %fdiv = fdiv afn half %a, %b, !fpmath !0 - ret half %fdiv -} - -define half @v_fdiv_f16_arcp_ulp25(half %a, half %b) { -; GFX6-LABEL: v_fdiv_f16_arcp_ulp25: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_rcp_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_setpc_b64 s[30:31] -; ; GFX89-LABEL: v_fdiv_f16_arcp_ulp25: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_rcp_f16_e32 v1, v1 -; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX89-NEXT: v_rcp_f32_e32 v2, v2 +; GFX89-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp half %a, %b, !fpmath !0 ret half %fdiv @@ -508,76 +522,20 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) { } define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) { -; GFX6-IEEE-LABEL: v_fdiv_v2f16_afn: -; GFX6-IEEE: ; %bb.0: -; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 -; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 -; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-FLUSH-LABEL: v_fdiv_v2f16_afn: -; GFX6-FLUSH: ; %bb.0: -; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 -; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v8, -v4, v7, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6 -; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 -; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 -; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: v_fdiv_v2f16_afn: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v3, v3 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f16_afn: ; GFX8: ; %bb.0: @@ -799,8 +757,18 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX8-LABEL: v_rcp_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_rcp_f16_e32 v0, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX8-NEXT: v_rcp_f32_e32 v1, v1 +; GFX8-NEXT: v_rcp_f32_e32 v3, v3 +; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1 +; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 ; GFX8-NEXT: v_mov_b32_e32 v2, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -809,35 +777,113 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX9-LABEL: v_rcp_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f16_e32 v1, v0 -; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1 +; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x ret <2 x half> %fdiv } define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) { -; GFX6-LABEL: v_rcp_v2f16_arcp: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, 1.0 -; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: s_setpc_b64 s[30:31] +; GFX6-IEEE-LABEL: v_rcp_v2f16_arcp: +; GFX6-IEEE: ; %bb.0: +; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp: +; GFX6-FLUSH: ; %bb.0: +; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rcp_v2f16_arcp: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_rcp_f16_e32 v0, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX8-NEXT: v_rcp_f32_e32 v1, v1 +; GFX8-NEXT: v_rcp_f32_e32 v3, v3 +; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1 +; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 ; GFX8-NEXT: v_mov_b32_e32 v2, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -846,10 +892,21 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) { ; GFX9-LABEL: v_rcp_v2f16_arcp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f16_e32 v1, v0 -; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1 +; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x half> <half 1.0, half 1.0>, %x ret <2 x half> %fdiv @@ -987,7 +1044,49 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { } define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) { -; GFX6-IEEE-LABEL: v_fdiv_v2f16_afn_ulp25: +; GFX6-LABEL: v_fdiv_v2f16_afn_ulp25: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v3, v3 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_rcp_f16_e32 v2, v1 +; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v1, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_rcp_f16_e32 v2, v1 +; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2 +; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0 + ret <2 x half> %fdiv +} + +define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) { +; GFX6-IEEE-LABEL: v_fdiv_v2f16_arcp_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -1020,7 +1119,7 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-FLUSH-LABEL: v_fdiv_v2f16_afn_ulp25: +; GFX6-FLUSH-LABEL: v_fdiv_v2f16_arcp_ulp25: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -1058,69 +1157,48 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_rcp_f16_e32 v2, v1 -; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f16_e32 v2, v1 -; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] - %fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0 - ret <2 x half> %fdiv -} - -define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) { -; GFX6-LABEL: v_fdiv_v2f16_arcp_ulp25: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_rcp_f32_e32 v2, v2 -; GFX6-NEXT: v_rcp_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: s_setpc_b64 s[30:31] -; ; GFX8-LABEL: v_fdiv_v2f16_arcp_ulp25: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_rcp_f16_e32 v2, v1 -; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 +; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX8-NEXT: v_rcp_f32_e32 v5, v5 +; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-NEXT: v_mul_f32_e32 v3, v7, v5 +; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_div_fixup_f16 v1, v3, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f16_e32 v2, v1 -; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX9-NEXT: v_rcp_f32_e32 v5, v5 +; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5 +; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x half> %a, %b, !fpmath !0 ret <2 x half> %fdiv diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll index 02114a058c89..a29c96b93f56 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll @@ -157,11 +157,23 @@ define float @v_rcp_f32(float %x) { ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GCN-FLUSH-LABEL: v_rcp_f32: -; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-FLUSH-NEXT: s_setpc_b64 s[30:31] +; GFX6-FLUSH-LABEL: v_rcp_f32: +; GFX6-FLUSH: ; %bb.0: +; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-IEEE-LABEL: v_rcp_f32: ; GFX89-IEEE: ; %bb.0: @@ -178,16 +190,96 @@ define float @v_rcp_f32(float %x) { ; GFX89-IEEE-NEXT: v_div_fmas_f32 v1, v1, v3, v4 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-FLUSH-LABEL: v_rcp_f32: +; GFX89-FLUSH: ; %bb.0: +; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 +; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v3, v1 +; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX89-FLUSH-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; GFX89-FLUSH-NEXT: v_fma_f32 v3, v4, v3, v3 +; GFX89-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v3 +; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v2 +; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v3, v4 +; GFX89-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX89-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv float 1.0, %x ret float %fdiv } define float @v_rcp_f32_arcp(float %x) { -; GCN-LABEL: v_rcp_f32_arcp: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX6-IEEE-LABEL: v_rcp_f32_arcp: +; GFX6-IEEE: ; %bb.0: +; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-FLUSH-LABEL: v_rcp_f32_arcp: +; GFX6-FLUSH: ; %bb.0: +; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-IEEE-LABEL: v_rcp_f32_arcp: +; GFX89-IEEE: ; %bb.0: +; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v3, v1 +; GFX89-IEEE-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v3, v4, v3, v3 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v4, v2, v3 +; GFX89-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v2 +; GFX89-IEEE-NEXT: v_fma_f32 v4, v5, v3, v4 +; GFX89-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-FLUSH-LABEL: v_rcp_f32_arcp: +; GFX89-FLUSH: ; %bb.0: +; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 +; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v3, v1 +; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX89-FLUSH-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; GFX89-FLUSH-NEXT: v_fma_f32 v3, v4, v3, v3 +; GFX89-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v3 +; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v2 +; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v3, v4 +; GFX89-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX89-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp float 1.0, %x ret float %fdiv } @@ -237,12 +329,21 @@ define float @v_fdiv_f32_afn_ulp25(float %a, float %b) { } define float @v_fdiv_f32_arcp_ulp25(float %a, float %b) { -; GCN-IEEE-LABEL: v_fdiv_f32_arcp_ulp25: -; GCN-IEEE: ; %bb.0: -; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-NEXT: v_rcp_f32_e32 v1, v1 -; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] +; GFX6-IEEE-LABEL: v_fdiv_f32_arcp_ulp25: +; GFX6-IEEE: ; %bb.0: +; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-FLUSH-LABEL: v_fdiv_f32_arcp_ulp25: ; GCN-FLUSH: ; %bb.0: @@ -256,6 +357,22 @@ define float @v_fdiv_f32_arcp_ulp25(float %a, float %b) { ; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v2, v0 ; GCN-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-IEEE-LABEL: v_fdiv_f32_arcp_ulp25: +; GFX89-IEEE: ; %bb.0: +; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX89-IEEE-NEXT: v_fma_f32 v5, -v2, v4, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX89-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v3 +; GFX89-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp float %a, %b, !fpmath !0 ret float %fdiv } @@ -500,12 +617,36 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) { ; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GCN-FLUSH-LABEL: v_rcp_v2f32: -; GCN-FLUSH: ; %bb.0: -; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 -; GCN-FLUSH-NEXT: s_setpc_b64 s[30:31] +; GFX6-FLUSH-LABEL: v_rcp_v2f32: +; GFX6-FLUSH: ; %bb.0: +; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v1, 1.0 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-IEEE-LABEL: v_rcp_v2f32: ; GFX89-IEEE: ; %bb.0: @@ -534,17 +675,158 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) { ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 ; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-FLUSH-LABEL: v_rcp_v2f32: +; GFX89-FLUSH: ; %bb.0: +; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0 +; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v3 +; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 +; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, 1.0 +; GFX89-FLUSH-NEXT: v_fma_f32 v2, v2, v5, v5 +; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v4 +; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5 +; GFX89-FLUSH-NEXT: v_fma_f32 v3, -v3, v5, v4 +; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v2, v5 +; GFX89-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x float> <float 1.0, float 1.0>, %x ret <2 x float> %fdiv } define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) { -; GCN-LABEL: v_rcp_v2f32_arcp: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_rcp_f32_e32 v1, v1 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX6-IEEE-LABEL: v_rcp_v2f32_arcp: +; GFX6-IEEE: ; %bb.0: +; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, 1.0, v1, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v5, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-FLUSH-LABEL: v_rcp_v2f32_arcp: +; GFX6-FLUSH: ; %bb.0: +; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v1, 1.0 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-IEEE-LABEL: v_rcp_v2f32_arcp: +; GFX89-IEEE: ; %bb.0: +; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], 1.0, v1, 1.0 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v6, v2 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v7, v3 +; GFX89-IEEE-NEXT: v_fma_f32 v8, -v2, v6, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v9, -v3, v7, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v6, v8, v6, v6 +; GFX89-IEEE-NEXT: v_fma_f32 v7, v9, v7, v7 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6 +; GFX89-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v4 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v9, v5, v7 +; GFX89-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v5 +; GFX89-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8 +; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v4 +; GFX89-IEEE-NEXT: v_fma_f32 v9, v11, v7, v9 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v6, v8 +; GFX89-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v5 +; GFX89-IEEE-NEXT: s_mov_b64 vcc, s[4:5] +; GFX89-IEEE-NEXT: v_div_fmas_f32 v3, v3, v7, v9 +; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX89-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 +; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-FLUSH-LABEL: v_rcp_v2f32_arcp: +; GFX89-FLUSH: ; %bb.0: +; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0 +; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v3 +; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 +; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, 1.0 +; GFX89-FLUSH-NEXT: v_fma_f32 v2, v2, v5, v5 +; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v4 +; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5 +; GFX89-FLUSH-NEXT: v_fma_f32 v3, -v3, v5, v4 +; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v2, v5 +; GFX89-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x ret <2 x float> %fdiv } @@ -604,14 +886,32 @@ define <2 x float> @v_fdiv_v2f32_afn_ulp25(<2 x float> %a, <2 x float> %b) { } define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) { -; GCN-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25: -; GCN-IEEE: ; %bb.0: -; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IEEE-NEXT: v_rcp_f32_e32 v2, v2 -; GCN-IEEE-NEXT: v_rcp_f32_e32 v3, v3 -; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2 -; GCN-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3 -; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] +; GFX6-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25: +; GFX6-IEEE: ; %bb.0: +; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 +; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v6, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v4, v6, v6 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v5, v6, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v5, v6, v2 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1 +; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-FLUSH-LABEL: v_fdiv_v2f32_arcp_ulp25: ; GCN-FLUSH: ; %bb.0: @@ -631,6 +931,34 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) { ; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v5, v0 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, v4, v1 ; GCN-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25: +; GFX89-IEEE: ; %bb.0: +; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 +; GFX89-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v1, v3, v1 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v8, v4 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v9, v5 +; GFX89-IEEE-NEXT: v_fma_f32 v10, -v4, v8, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v11, -v5, v9, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v8, v10, v8, v8 +; GFX89-IEEE-NEXT: v_fma_f32 v9, v11, v9, v9 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v10, v6, v8 +; GFX89-IEEE-NEXT: v_fma_f32 v12, -v4, v10, v6 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v11, v7, v9 +; GFX89-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v7 +; GFX89-IEEE-NEXT: v_fma_f32 v10, v12, v8, v10 +; GFX89-IEEE-NEXT: v_fma_f32 v4, -v4, v10, v6 +; GFX89-IEEE-NEXT: v_fma_f32 v11, v13, v9, v11 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v4, v4, v8, v10 +; GFX89-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v7 +; GFX89-IEEE-NEXT: s_mov_b64 vcc, s[4:5] +; GFX89-IEEE-NEXT: v_div_fmas_f32 v5, v5, v9, v11 +; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX89-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 +; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0 ret <2 x float> %fdiv } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll index 3b9e56b1a742..b7546223f7d9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -410,21 +410,12 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[0:1] -; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1] -; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; CI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] -; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] -; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] -; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[0:1] -; CI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] -; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3] +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1] +; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] +; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3] ; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -438,21 +429,12 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[0:1] -; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1] -; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] -; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] -; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] -; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[0:1] -; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] -; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3] +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1] +; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] +; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3] ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir index 9bef474d08e3..4554bc81ef93 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir @@ -308,21 +308,60 @@ body: | ; SI-LABEL: name: test_fdiv_s32_denorms_off_arcp ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; SI: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32) - ; SI: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]] - ; SI: $vgpr0 = COPY [[FMUL]](s32) + ; SI: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; SI: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 0 + ; SI: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 1 + ; SI: [[INT4:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32) + ; SI: [[FNEG:%[0-9]+]]:_(s32) = arcp G_FNEG [[INT]] + ; SI: S_SETREG_IMM32_B32 3, 2305, implicit-def $mode, implicit $mode + ; SI: [[FMA:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[INT4]], [[C]] + ; SI: [[FMA1:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA]], [[INT4]], [[INT4]] + ; SI: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[INT2]], [[FMA1]] + ; SI: [[FMA2:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMUL]], [[INT2]] + ; SI: [[FMA3:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA2]], [[FMA1]], [[FMUL]] + ; SI: [[FMA4:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMA3]], [[INT2]] + ; SI: S_SETREG_IMM32_B32 0, 2305, implicit-def $mode, implicit $mode + ; SI: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) + ; SI: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) + ; SI: $vgpr0 = COPY [[INT6]](s32) ; VI-LABEL: name: test_fdiv_s32_denorms_off_arcp ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; VI: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32) - ; VI: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]] - ; VI: $vgpr0 = COPY [[FMUL]](s32) + ; VI: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; VI: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 0 + ; VI: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 1 + ; VI: [[INT4:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32) + ; VI: [[FNEG:%[0-9]+]]:_(s32) = arcp G_FNEG [[INT]] + ; VI: S_SETREG_IMM32_B32 3, 2305, implicit-def $mode, implicit $mode + ; VI: [[FMA:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[INT4]], [[C]] + ; VI: [[FMA1:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA]], [[INT4]], [[INT4]] + ; VI: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[INT2]], [[FMA1]] + ; VI: [[FMA2:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMUL]], [[INT2]] + ; VI: [[FMA3:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA2]], [[FMA1]], [[FMUL]] + ; VI: [[FMA4:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMA3]], [[INT2]] + ; VI: S_SETREG_IMM32_B32 0, 2305, implicit-def $mode, implicit $mode + ; VI: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) + ; VI: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) + ; VI: $vgpr0 = COPY [[INT6]](s32) ; GFX9-LABEL: name: test_fdiv_s32_denorms_off_arcp ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32) - ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]] - ; GFX9: $vgpr0 = COPY [[FMUL]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX9: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 0 + ; GFX9: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 1 + ; GFX9: [[INT4:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32) + ; GFX9: [[FNEG:%[0-9]+]]:_(s32) = arcp G_FNEG [[INT]] + ; GFX9: S_SETREG_IMM32_B32 3, 2305, implicit-def $mode, implicit $mode + ; GFX9: [[FMA:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[INT4]], [[C]] + ; GFX9: [[FMA1:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA]], [[INT4]], [[INT4]] + ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[INT2]], [[FMA1]] + ; GFX9: [[FMA2:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMUL]], [[INT2]] + ; GFX9: [[FMA3:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA2]], [[FMA1]], [[FMUL]] + ; GFX9: [[FMA4:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMA3]], [[INT2]] + ; GFX9: S_SETREG_IMM32_B32 0, 2305, implicit-def $mode, implicit $mode + ; GFX9: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) + ; GFX9: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) + ; GFX9: $vgpr0 = COPY [[INT6]](s32) ; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_denorms_off_arcp ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 @@ -332,9 +371,22 @@ body: | ; GFX10-LABEL: name: test_fdiv_s32_denorms_off_arcp ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32) - ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]] - ; GFX10: $vgpr0 = COPY [[FMUL]](s32) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX10: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 0 + ; GFX10: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 1 + ; GFX10: [[INT4:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32) + ; GFX10: [[FNEG:%[0-9]+]]:_(s32) = arcp G_FNEG [[INT]] + ; GFX10: S_DENORM_MODE 15, implicit-def $mode, implicit $mode + ; GFX10: [[FMA:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[INT4]], [[C]] + ; GFX10: [[FMA1:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA]], [[INT4]], [[INT4]] + ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[INT2]], [[FMA1]] + ; GFX10: [[FMA2:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMUL]], [[INT2]] + ; GFX10: [[FMA3:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA2]], [[FMA1]], [[FMUL]] + ; GFX10: [[FMA4:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMA3]], [[INT2]] + ; GFX10: S_DENORM_MODE 12, implicit-def $mode, implicit $mode + ; GFX10: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) + ; GFX10: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) + ; GFX10: $vgpr0 = COPY [[INT6]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = arcp G_FDIV %0, %1 @@ -1898,16 +1950,28 @@ body: | ; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) ; SI: $vgpr0 = COPY [[ANYEXT]](s32) ; VI-LABEL: name: test_fdiv_s16_constant_one_rcp + ; VI: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; VI: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16) - ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) + ; VI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16) + ; VI: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) + ; VI: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) + ; VI: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] + ; VI: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) + ; VI: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16) + ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) ; VI: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-LABEL: name: test_fdiv_s16_constant_one_rcp + ; GFX9: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX9: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16) - ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) + ; GFX9: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16) + ; GFX9: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) + ; GFX9: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) + ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] + ; GFX9: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) + ; GFX9: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16) + ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-UNSAFE-LABEL: name: test_fdiv_s16_constant_one_rcp ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 @@ -1916,10 +1980,16 @@ body: | ; GFX9-UNSAFE: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) ; GFX9-UNSAFE: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX10-LABEL: name: test_fdiv_s16_constant_one_rcp + ; GFX10: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX10: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16) - ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) + ; GFX10: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16) + ; GFX10: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) + ; GFX10: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) + ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] + ; GFX10: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) + ; GFX10: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16) + ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) ; GFX10: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s16) = G_FCONSTANT half 1.0 %1:_(s32) = COPY $vgpr0 @@ -1958,18 +2028,28 @@ body: | ; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) ; SI: $vgpr0 = COPY [[ANYEXT]](s32) ; VI-LABEL: name: test_fdiv_s16_constant_negative_one_rcp + ; VI: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xHBC00 ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; VI: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC]] - ; VI: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16) - ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) + ; VI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16) + ; VI: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) + ; VI: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) + ; VI: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] + ; VI: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) + ; VI: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16) + ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) ; VI: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-LABEL: name: test_fdiv_s16_constant_negative_one_rcp + ; GFX9: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xHBC00 ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX9: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC]] - ; GFX9: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16) - ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) + ; GFX9: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16) + ; GFX9: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) + ; GFX9: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) + ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] + ; GFX9: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) + ; GFX9: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16) + ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-UNSAFE-LABEL: name: test_fdiv_s16_constant_negative_one_rcp ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 @@ -1979,11 +2059,16 @@ body: | ; GFX9-UNSAFE: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) ; GFX9-UNSAFE: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX10-LABEL: name: test_fdiv_s16_constant_negative_one_rcp + ; GFX10: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xHBC00 ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX10: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC]] - ; GFX10: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16) - ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) + ; GFX10: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16) + ; GFX10: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) + ; GFX10: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) + ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] + ; GFX10: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) + ; GFX10: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16) + ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) ; GFX10: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s16) = G_FCONSTANT half -1.0 %1:_(s32) = COPY $vgpr0 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits