arsenm created this revision.
arsenm added reviewers: AMDGPU, foad, rampitec, Pierre-vh, cdevadas, jhuber6.
Herald added subscribers: StephenFan, kerbowa, hiraditya, tpr, dstuttard, 
yaxunl, jvesely, kzhuravl.
Herald added a project: All.
arsenm requested review of this revision.
Herald added a subscriber: wdng.
Herald added a project: LLVM.

This maps directly to the hardware instruction, which does not handle
denormals for f32. Having a raw intrinsic will allow the generic log2
intrinsic to be lowered correctly. This also handles selecting the f16
version, though there is no reason to use it over the generic intrinsic
since v_log_f16 does handle denormals.
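For reference, a minimal OpenCL usage sketch (the kernel name and argument
names are illustrative only, not part of the patch):

  kernel void log2_example(global float *out, float x) {
    // Emits a call to llvm.amdgcn.log.f32, which selects to v_log_f32.
    // The instruction computes log2(x) and does not handle f32 denormal
    // inputs.
    *out = __builtin_amdgcn_logf(x);
  }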


https://reviews.llvm.org/D152697

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGenOpenCL/builtins-amdgcn.cl
  llvm/docs/ReleaseNotes.rst
  llvm/include/llvm/IR/IntrinsicsAMDGPU.td
  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
  llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
  llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
  llvm/lib/Target/AMDGPU/R600Instructions.td
  llvm/lib/Target/AMDGPU/SIISelLowering.cpp
  llvm/lib/Target/AMDGPU/VOP1Instructions.td
  llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.ll

Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GISEL %s
+
+define float @v_log_f32(float %src)  {
+; GCN-LABEL: v_log_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_log_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %log = call float @llvm.amdgcn.log.f32(float %src)
+  ret float %log
+}
+
+define float @v_fabs_log_f32(float %src)  {
+; GCN-LABEL: v_fabs_log_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_log_f32_e64 v0, |v0|
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %fabs.src = call float @llvm.fabs.f32(float %src)
+  %log = call float @llvm.amdgcn.log.f32(float %fabs.src)
+  ret float %log
+}
+
+define float @v_fneg_fabs_log_f32(float %src)  {
+; GCN-LABEL: v_fneg_fabs_log_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_log_f32_e64 v0, -|v0|
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %fabs.src = call float @llvm.fabs.f32(float %src)
+  %neg.fabs.src = fneg float %fabs.src
+  %log = call float @llvm.amdgcn.log.f32(float %neg.fabs.src)
+  ret float %log
+}
+
+define half @v_log_f16(half %src)  {
+; GCN-LABEL: v_log_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_log_f16_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %log = call half @llvm.amdgcn.log.f16(half %src)
+  ret half %log
+}
+
+define half @v_fabs_log_f16(half %src)  {
+; GCN-LABEL: v_fabs_log_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_log_f16_e64 v0, |v0|
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %fabs.src = call half @llvm.fabs.f16(half %src)
+  %log = call half @llvm.amdgcn.log.f16(half %fabs.src)
+  ret half %log
+}
+
+define half @v_fneg_fabs_log_f16(half %src)  {
+; GCN-LABEL: v_fneg_fabs_log_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_log_f16_e64 v0, -|v0|
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %fabs.src = call half @llvm.fabs.f16(half %src)
+  %neg.fabs.src = fneg half %fabs.src
+  %log = call half @llvm.amdgcn.log.f16(half %neg.fabs.src)
+  ret half %log
+}
+
+declare half @llvm.amdgcn.log.f16(half) #0
+declare float @llvm.amdgcn.log.f32(float) #0
+declare float @llvm.fabs.f32(float) #0
+declare half @llvm.fabs.f16(half) #0
+
+attributes #0 = { nounwind readnone speculatable willreturn }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GISEL: {{.*}}
+; SDAG: {{.*}}
Index: llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -872,6 +872,16 @@
   ret float %canonicalized
 }
 
+; GCN-LABEL: {{^}}v_test_canonicalize_amdgcn_log:
+; GCN: s_waitcnt
+; GCN-NEXT: v_log_f32
+; GCN-NEXT: s_setpc_b64
+define float @v_test_canonicalize_amdgcn_log(float %a) {
+  %log = call float @llvm.amdgcn.log.f32(float %a)
+  %canonicalized = call float @llvm.canonicalize.f32(float %log)
+  ret float %canonicalized
+}
+
 ; Avoid failing the test on FreeBSD11.0 which will match the GCN-NOT: 1.0
 ; in the .amd_amdgpu_isa "amdgcn-unknown-freebsd11.0--gfx802" directive
 ; GCN: .amd_amdgpu_isa
@@ -900,6 +910,7 @@
 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #0
 declare float @llvm.amdgcn.cubeid(float, float, float) #0
 declare float @llvm.amdgcn.frexp.mant.f32(float) #0
+declare float @llvm.amdgcn.log.f32(float) #0
 
 attributes #0 = { nounwind readnone }
 attributes #1 = { "no-nans-fp-math"="true" }
Index: llvm/lib/Target/AMDGPU/VOP1Instructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -322,7 +322,7 @@
 
 let TRANS = 1, SchedRW = [WriteTrans32] in {
 defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>;
-defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, flog2>;
+defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, AMDGPUlog>;
 defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>;
 defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32, AMDGPUrcp_iflag>;
 defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>;
@@ -487,7 +487,7 @@
 defm V_RCP_F16 : VOP1Inst_t16 <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
 defm V_SQRT_F16 : VOP1Inst_t16 <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>;
 defm V_RSQ_F16 : VOP1Inst_t16 <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>;
-defm V_LOG_F16 : VOP1Inst_t16 <"v_log_f16", VOP_F16_F16, flog2>;
+defm V_LOG_F16 : VOP1Inst_t16 <"v_log_f16", VOP_F16_F16, AMDGPUlogf16>;
 defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, fexp2>;
 defm V_SIN_F16 : VOP1Inst_t16 <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
 defm V_COS_F16 : VOP1Inst_t16 <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10555,6 +10555,7 @@
     case Intrinsic::amdgcn_rcp_legacy:
     case Intrinsic::amdgcn_rsq_legacy:
     case Intrinsic::amdgcn_trig_preop:
+    case Intrinsic::amdgcn_log:
       return true;
     default:
       break;
Index: llvm/lib/Target/AMDGPU/R600Instructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/R600Instructions.td
+++ llvm/lib/Target/AMDGPU/R600Instructions.td
@@ -1124,7 +1124,7 @@
 >;
 
 class LOG_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
-  inst, "LOG_IEEE", flog2
+  inst, "LOG_IEEE", AMDGPUlog
 > {
   let Itinerary = TransALU;
 }
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4207,6 +4207,7 @@
     case Intrinsic::amdgcn_sin:
     case Intrinsic::amdgcn_cos:
     case Intrinsic::amdgcn_log_clamp:
+    case Intrinsic::amdgcn_log:
     case Intrinsic::amdgcn_rcp:
     case Intrinsic::amdgcn_rcp_legacy:
     case Intrinsic::amdgcn_sqrt:
Index: llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -115,6 +115,9 @@
 // out = 1.0 / a
 def AMDGPUrcp_impl : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
 
+// v_log_f32, which is log2
+def AMDGPUlog_impl : SDNode<"AMDGPUISD::LOG", SDTFPUnaryOp>;
+
 // out = 1.0 / sqrt(a)
 def AMDGPUrsq_impl : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
 
@@ -385,6 +388,12 @@
                                            (AMDGPUcos_impl node:$src)]>;
 def AMDGPUfract : PatFrags<(ops node:$src), [(int_amdgcn_fract node:$src),
                                              (AMDGPUfract_impl node:$src)]>;
+def AMDGPUlog : PatFrags<(ops node:$src), [(int_amdgcn_log node:$src),
+                                           (AMDGPUlog_impl node:$src),
+                                           (flog2 node:$src)]>;
+def AMDGPUlogf16 : PatFrags<(ops node:$src), [(int_amdgcn_log node:$src),
+                                              (flog2 node:$src)]>;
+
 
 def AMDGPUfp_class : PatFrags<(ops node:$src0, node:$src1),
   [(int_amdgcn_class node:$src0, node:$src1),
Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -444,6 +444,10 @@
   RSQ,
   RCP_LEGACY,
   RCP_IFLAG,
+
+  // log2, no denormal handling for f32.
+  LOG,
+
   FMUL_LEGACY,
   RSQ_CLAMP,
   FP_CLASS,
Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4687,6 +4687,7 @@
   NODE_NAME_CASE(RSQ)
   NODE_NAME_CASE(RCP_LEGACY)
   NODE_NAME_CASE(RCP_IFLAG)
+  NODE_NAME_CASE(LOG)
   NODE_NAME_CASE(FMUL_LEGACY)
   NODE_NAME_CASE(RSQ_CLAMP)
   NODE_NAME_CASE(FP_CLASS)
Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -300,6 +300,14 @@
   [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
 >;
 
+// v_log_{f16|f32}, performs log2. f32 version does not handle
+// denormals. There is no reason to use this for f16 as it does
+// support denormals, and the generic log intrinsic should be
+// preferred.
+def int_amdgcn_log : DefaultAttrsIntrinsic<
+  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
+>;
+
 def int_amdgcn_log_clamp : DefaultAttrsIntrinsic<
   [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
 >;
Index: llvm/docs/ReleaseNotes.rst
===================================================================
--- llvm/docs/ReleaseNotes.rst
+++ llvm/docs/ReleaseNotes.rst
@@ -133,6 +133,9 @@
   improves the interaction between AMDGPU buffer operations and the LLVM memory
   model, and so the non `.ptr` intrinsics are deprecated.
 
+* Added llvm.amdgcn.log.f32 intrinsic. This provides direct access to
+  v_log_f32.
+
 Changes to the ARM Backend
 --------------------------
 
Index: clang/test/CodeGenOpenCL/builtins-amdgcn.cl
===================================================================
--- clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -172,6 +172,13 @@
   *out = __builtin_amdgcn_cosf(a);
 }
 
+// CHECK-LABEL: @test_log_f32
+// CHECK: call float @llvm.amdgcn.log.f32
+void test_log_f32(global float* out, float a)
+{
+  *out = __builtin_amdgcn_logf(a);
+}
+
 // CHECK-LABEL: @test_log_clamp_f32
 // CHECK: call float @llvm.amdgcn.log.clamp.f32
 void test_log_clamp_f32(global float* out, float a)
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -17171,6 +17171,8 @@
     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_cos);
   case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
     return EmitAMDGPUDispatchPtr(*this, E);
+  case AMDGPU::BI__builtin_amdgcn_logf:
+    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log);
   case AMDGPU::BI__builtin_amdgcn_log_clampf:
     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log_clamp);
   case AMDGPU::BI__builtin_amdgcn_ldexp:
Index: clang/include/clang/Basic/BuiltinsAMDGPU.def
===================================================================
--- clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -100,6 +100,7 @@
 BUILTIN(__builtin_amdgcn_rsq_clampf, "ff", "nc")
 BUILTIN(__builtin_amdgcn_sinf, "ff", "nc")
 BUILTIN(__builtin_amdgcn_cosf, "ff", "nc")
+BUILTIN(__builtin_amdgcn_logf, "ff", "nc")
 BUILTIN(__builtin_amdgcn_log_clampf, "ff", "nc")
 BUILTIN(__builtin_amdgcn_ldexp, "ddi", "nc")
 BUILTIN(__builtin_amdgcn_ldexpf, "ffi", "nc")