arsenm created this revision.
arsenm added reviewers: AMDGPU, foad, rampitec, Pierre-vh, cdevadas, jhuber6.
Herald added subscribers: StephenFan, kerbowa, hiraditya, tpr, dstuttard, yaxunl, jvesely, kzhuravl.
Herald added a project: All.
arsenm requested review of this revision.
Herald added a subscriber: wdng.
Herald added a project: LLVM.
Add the llvm.amdgcn.log intrinsic. It maps directly to the hardware instruction, which does not handle denormals for f32. Having the raw intrinsic available will allow the lowering of the generic log2 intrinsic to be moved to a correct, denormal-aware expansion. The f16 version is also selectable, but there is no reason to use it over the generic intrinsic, since v_log_f16 does handle denormals.

https://reviews.llvm.org/D152697

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGenOpenCL/builtins-amdgcn.cl
  llvm/docs/ReleaseNotes.rst
  llvm/include/llvm/IR/IntrinsicsAMDGPU.td
  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
  llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
  llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
  llvm/lib/Target/AMDGPU/R600Instructions.td
  llvm/lib/Target/AMDGPU/SIISelLowering.cpp
  llvm/lib/Target/AMDGPU/VOP1Instructions.td
  llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.ll
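For context, here is a rough IR-level sketch (not part of the diff; the function names are invented for illustration) of how the raw intrinsic added here is meant to coexist with the generic llvm.log2 intrinsic once the generic lowering is moved over:

  declare float @llvm.amdgcn.log.f32(float)   ; added by this patch
  declare float @llvm.log2.f32(float)         ; existing generic intrinsic

  define float @raw_log2(float %x) {
    ; Selects directly to v_log_f32; the caller accepts that f32
    ; denormal inputs are not handled.
    %r = call float @llvm.amdgcn.log.f32(float %x)
    ret float %r
  }

  define float @generic_log2(float %x) {
    ; The generic intrinsic stays the portable form; its lowering can
    ; insert any needed denormal scaling around v_log_f32.
    %r = call float @llvm.log2.f32(float %x)
    ret float %r
  }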
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GISEL %s
+
+define float @v_log_f32(float %src) {
+; GCN-LABEL: v_log_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_log_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %log = call float @llvm.amdgcn.log.f32(float %src)
+  ret float %log
+}
+
+define float @v_fabs_log_f32(float %src) {
+; GCN-LABEL: v_fabs_log_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_log_f32_e64 v0, |v0|
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %fabs.src = call float @llvm.fabs.f32(float %src)
+  %log = call float @llvm.amdgcn.log.f32(float %fabs.src)
+  ret float %log
+}
+
+define float @v_fneg_fabs_log_f32(float %src) {
+; GCN-LABEL: v_fneg_fabs_log_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_log_f32_e64 v0, -|v0|
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %fabs.src = call float @llvm.fabs.f32(float %src)
+  %neg.fabs.src = fneg float %fabs.src
+  %log = call float @llvm.amdgcn.log.f32(float %neg.fabs.src)
+  ret float %log
+}
+
+define half @v_log_f16(half %src) {
+; GCN-LABEL: v_log_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_log_f16_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %log = call half @llvm.amdgcn.log.f16(half %src)
+  ret half %log
+}
+
+define half @v_fabs_log_f16(half %src) {
+; GCN-LABEL: v_fabs_log_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_log_f16_e64 v0, |v0|
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %fabs.src = call half @llvm.fabs.f16(half %src)
+  %log = call half @llvm.amdgcn.log.f16(half %fabs.src)
+  ret half %log
+}
+
+define half @v_fneg_fabs_log_f16(half %src) {
+; GCN-LABEL: v_fneg_fabs_log_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_log_f16_e64 v0, -|v0|
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %fabs.src = call half @llvm.fabs.f16(half %src)
+  %neg.fabs.src = fneg half %fabs.src
+  %log = call half @llvm.amdgcn.log.f16(half %neg.fabs.src)
+  ret half %log
+}
+
+declare half @llvm.amdgcn.log.f16(half) #0
+declare float @llvm.amdgcn.log.f32(float) #0
+declare float @llvm.fabs.f32(float) #0
+declare half @llvm.fabs.f16(half) #0
+
+attributes #0 = { nounwind readnone speculatable willreturn }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GISEL: {{.*}}
+; SDAG: {{.*}}
Index: llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -872,6 +872,16 @@
   ret float %canonicalized
 }
 
+; GCN-LABEL: {{^}}v_test_canonicalize_amdgcn_log:
+; GCN: s_waitcnt
+; GCN-NEXT: v_log_f32
+; GCN-NEXT: s_setpc_b64
+define float @v_test_canonicalize_amdgcn_log(float %a) {
+  %log = call float @llvm.amdgcn.log.f32(float %a)
+  %canonicalized = call float @llvm.canonicalize.f32(float %log)
+  ret float %canonicalized
+}
+
 ; Avoid failing the test on FreeBSD11.0 which will match the GCN-NOT: 1.0
 ; in the .amd_amdgpu_isa "amdgcn-unknown-freebsd11.0--gfx802" directive
 ; GCN: .amd_amdgpu_isa
@@ -900,6 +910,7 @@
 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #0
 declare float @llvm.amdgcn.cubeid(float, float, float) #0
 declare float @llvm.amdgcn.frexp.mant.f32(float) #0
+declare float @llvm.amdgcn.log.f32(float) #0
 
 attributes #0 = { nounwind readnone }
 attributes #1 = { "no-nans-fp-math"="true" }
Index: llvm/lib/Target/AMDGPU/VOP1Instructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -322,7 +322,7 @@
 
 let TRANS = 1, SchedRW = [WriteTrans32] in {
 defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>;
-defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, flog2>;
+defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, AMDGPUlog>;
 defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>;
 defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32, AMDGPUrcp_iflag>;
 defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>;
@@ -487,7 +487,7 @@
 defm V_RCP_F16 : VOP1Inst_t16 <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
 defm V_SQRT_F16 : VOP1Inst_t16 <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>;
 defm V_RSQ_F16 : VOP1Inst_t16 <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>;
-defm V_LOG_F16 : VOP1Inst_t16 <"v_log_f16", VOP_F16_F16, flog2>;
+defm V_LOG_F16 : VOP1Inst_t16 <"v_log_f16", VOP_F16_F16, AMDGPUlogf16>;
 defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, fexp2>;
 defm V_SIN_F16 : VOP1Inst_t16 <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
 defm V_COS_F16 : VOP1Inst_t16 <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10555,6 +10555,7 @@
   case Intrinsic::amdgcn_rcp_legacy:
   case Intrinsic::amdgcn_rsq_legacy:
   case Intrinsic::amdgcn_trig_preop:
+  case Intrinsic::amdgcn_log:
     return true;
   default:
     break;
Index: llvm/lib/Target/AMDGPU/R600Instructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/R600Instructions.td
+++ llvm/lib/Target/AMDGPU/R600Instructions.td
@@ -1124,7 +1124,7 @@
 >;
 
 class LOG_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
-  inst, "LOG_IEEE", flog2
+  inst, "LOG_IEEE", AMDGPUlog
 > {
   let Itinerary = TransALU;
 }
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4207,6 +4207,7 @@
     case Intrinsic::amdgcn_sin:
     case Intrinsic::amdgcn_cos:
     case Intrinsic::amdgcn_log_clamp:
+    case Intrinsic::amdgcn_log:
     case Intrinsic::amdgcn_rcp:
     case Intrinsic::amdgcn_rcp_legacy:
     case Intrinsic::amdgcn_sqrt:
Index: llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -115,6 +115,9 @@
 // out = 1.0 / a
 def AMDGPUrcp_impl : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
 
+// v_log_f32, which is log2
+def AMDGPUlog_impl : SDNode<"AMDGPUISD::LOG", SDTFPUnaryOp>;
+
 // out = 1.0 / sqrt(a)
 def AMDGPUrsq_impl : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
 
@@ -385,6 +388,12 @@
                                        (AMDGPUcos_impl node:$src)]>;
 def AMDGPUfract : PatFrags<(ops node:$src), [(int_amdgcn_fract node:$src),
                                              (AMDGPUfract_impl node:$src)]>;
+def AMDGPUlog : PatFrags<(ops node:$src), [(int_amdgcn_log node:$src),
+                                           (AMDGPUlog_impl node:$src),
+                                           (flog2 node:$src)]>;
+def AMDGPUlogf16 : PatFrags<(ops node:$src), [(int_amdgcn_log node:$src),
+                                              (flog2 node:$src)]>;
+
 
 def AMDGPUfp_class : PatFrags<(ops node:$src0, node:$src1),
   [(int_amdgcn_class node:$src0, node:$src1),
Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -444,6 +444,10 @@
   RSQ,
   RCP_LEGACY,
   RCP_IFLAG,
+
+  // log2, no denormal handling for f32.
+  LOG,
+
   FMUL_LEGACY,
   RSQ_CLAMP,
   FP_CLASS,
Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4687,6 +4687,7 @@
   NODE_NAME_CASE(RSQ)
   NODE_NAME_CASE(RCP_LEGACY)
   NODE_NAME_CASE(RCP_IFLAG)
+  NODE_NAME_CASE(LOG)
  NODE_NAME_CASE(FMUL_LEGACY)
   NODE_NAME_CASE(RSQ_CLAMP)
   NODE_NAME_CASE(FP_CLASS)
Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -300,6 +300,14 @@
   [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
 >;
 
+// v_log_{f16|f32}, performs log2. f32 version does not handle
+// denormals. There is no reason to use this for f16 as it does
+// support denormals, and the generic log intrinsic should be
+// preferred.
+def int_amdgcn_log : DefaultAttrsIntrinsic<
+  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
+>;
+
 def int_amdgcn_log_clamp : DefaultAttrsIntrinsic<
   [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
 >;
Index: llvm/docs/ReleaseNotes.rst
===================================================================
--- llvm/docs/ReleaseNotes.rst
+++ llvm/docs/ReleaseNotes.rst
@@ -133,6 +133,9 @@
   improves the interaction between AMDGPU buffer operations and the LLVM memory
   model, and so the non `.ptr` intrinsics are deprecated.
 
+* Added llvm.amdgcn.log.f32 intrinsic. This provides direct access to
+  v_log_f32.
+
 Changes to the ARM Backend
 --------------------------
 
Index: clang/test/CodeGenOpenCL/builtins-amdgcn.cl
===================================================================
--- clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -172,6 +172,13 @@
   *out = __builtin_amdgcn_cosf(a);
 }
 
+// CHECK-LABEL: @test_log_f32
+// CHECK: call float @llvm.amdgcn.log.f32
+void test_log_f32(global float* out, float a)
+{
+  *out = __builtin_amdgcn_logf(a);
+}
+
 // CHECK-LABEL: @test_log_clamp_f32
 // CHECK: call float @llvm.amdgcn.log.clamp.f32
 void test_log_clamp_f32(global float* out, float a)
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -17171,6 +17171,8 @@
     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_cos);
   case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
     return EmitAMDGPUDispatchPtr(*this, E);
+  case AMDGPU::BI__builtin_amdgcn_logf:
+    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log);
   case AMDGPU::BI__builtin_amdgcn_log_clampf:
     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log_clamp);
   case AMDGPU::BI__builtin_amdgcn_ldexp:
Index: clang/include/clang/Basic/BuiltinsAMDGPU.def
===================================================================
--- clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -100,6 +100,7 @@
 BUILTIN(__builtin_amdgcn_rsq_clampf, "ff", "nc")
 BUILTIN(__builtin_amdgcn_sinf, "ff", "nc")
 BUILTIN(__builtin_amdgcn_cosf, "ff", "nc")
+BUILTIN(__builtin_amdgcn_logf, "ff", "nc")
 BUILTIN(__builtin_amdgcn_log_clampf, "ff", "nc")
 BUILTIN(__builtin_amdgcn_ldexp, "ddi", "nc")
 BUILTIN(__builtin_amdgcn_ldexpf, "ffi", "nc")
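As a side note on the comment added to IntrinsicsAMDGPU.td about f16: a minimal sketch (names invented, not taken from the patch) of why frontends should keep emitting the generic intrinsic for half and reserve llvm.amdgcn.log for the f32 case where the raw instruction is explicitly wanted:

  declare half @llvm.amdgcn.log.f16(half)
  declare half @llvm.log2.f16(half)

  define half @prefer_generic_for_f16(half %x) {
    ; v_log_f16 already handles denormals, so the target intrinsic gains
    ; nothing here; the generic form keeps the IR target-independent.
    %r = call half @llvm.log2.f16(half %x)
    ret half %r
  }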