https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/112834
>From 3c21269ad0b7be617b06cde5debe405f99ef17ef Mon Sep 17 00:00:00 2001
From: Alex Maclean <amacl...@nvidia.com>
Date: Thu, 17 Oct 2024 16:49:24 +0000
Subject: [PATCH 1/2] [NVPTX] Remove nvvm.ldg.global.* intrinsics

---
 clang/lib/CodeGen/CGBuiltin.cpp               |  45 +++--
 .../builtins-nvptx-native-half-type-native.c  |   4 +-
 .../CodeGen/builtins-nvptx-native-half-type.c |   4 +-
 clang/test/CodeGen/builtins-nvptx.c           |  72 +++----
 llvm/include/llvm/IR/IntrinsicsNVVM.td        |  18 +-
 llvm/lib/IR/AutoUpgrade.cpp                   |  14 ++
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp   | 189 +++++++-----------
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp   |  55 +----
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h     |   2 -
 .../Assembler/auto_upgrade_nvvm_intrinsics.ll |  31 +++
 10 files changed, 188 insertions(+), 246 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 1ad950798c2118..40a875ab29c900 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -20485,7 +20485,7 @@ static NVPTXMmaInfo getNVPTXMmaInfo(unsigned BuiltinID) {
 #undef MMA_VARIANTS_B1_XOR
 }
 
-static Value *MakeLdgLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
+static Value *MakeLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
                          const CallExpr *E) {
   Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
   QualType ArgType = E->getArg(0)->getType();
@@ -20496,6 +20496,21 @@ static Value *MakeLdgLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
       {Ptr, ConstantInt::get(CGF.Builder.getInt32Ty(), Align.getQuantity())});
 }
 
+static Value *MakeLdg(CodeGenFunction &CGF, const CallExpr *E) {
+  Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
+  QualType ArgType = E->getArg(0)->getType();
+  clang::CharUnits AlignV = CGF.CGM.getNaturalPointeeTypeAlignment(ArgType);
+  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ArgType->getPointeeType());
+
+  // Use addrspace(1) for NVPTX ADDRESS_SPACE_GLOBAL
+  auto *ASC = CGF.Builder.CreateAddrSpaceCast(Ptr, CGF.Builder.getPtrTy(1));
+  auto *LD = CGF.Builder.CreateAlignedLoad(ElemTy, ASC, AlignV.getAsAlign());
+  MDNode *MD = MDNode::get(CGF.Builder.getContext(), {});
+  LD->setMetadata(LLVMContext::MD_invariant_load, MD);
+
+  return LD;
+}
+
 static Value *MakeScopedAtomic(unsigned IntrinsicID, CodeGenFunction &CGF,
                                const CallExpr *E) {
   Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
@@ -20529,9 +20544,11 @@ static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
     return nullptr;
   }
 
-  if (IntrinsicID == Intrinsic::nvvm_ldg_global_f ||
-      IntrinsicID == Intrinsic::nvvm_ldu_global_f)
-    return MakeLdgLdu(IntrinsicID, CGF, E);
+  if (BuiltinID == NVPTX::BI__nvvm_ldg_h || BuiltinID == NVPTX::BI__nvvm_ldg_h2)
+    return MakeLdg(CGF, E);
+
+  if (IntrinsicID == Intrinsic::nvvm_ldu_global_f)
+    return MakeLdu(IntrinsicID, CGF, E);
 
   SmallVector<Value *, 16> Args;
   auto *F = CGF.CGM.getIntrinsic(IntrinsicID);
@@ -20668,16 +20685,15 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
   case NVPTX::BI__nvvm_ldg_ul2:
   case NVPTX::BI__nvvm_ldg_ull:
   case NVPTX::BI__nvvm_ldg_ull2:
-    // PTX Interoperability section 2.2: "For a vector with an even number of
-    // elements, its alignment is set to number of elements times the alignment
-    // of its member: n*alignof(t)."
-    return MakeLdgLdu(Intrinsic::nvvm_ldg_global_i, *this, E);
   case NVPTX::BI__nvvm_ldg_f:
   case NVPTX::BI__nvvm_ldg_f2:
   case NVPTX::BI__nvvm_ldg_f4:
   case NVPTX::BI__nvvm_ldg_d:
   case NVPTX::BI__nvvm_ldg_d2:
-    return MakeLdgLdu(Intrinsic::nvvm_ldg_global_f, *this, E);
+    // PTX Interoperability section 2.2: "For a vector with an even number of
+    // elements, its alignment is set to number of elements times the alignment
+    // of its member: n*alignof(t)."
+    return MakeLdg(*this, E);
 
   case NVPTX::BI__nvvm_ldu_c:
   case NVPTX::BI__nvvm_ldu_sc:
@@ -20708,13 +20724,13 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
   case NVPTX::BI__nvvm_ldu_ul2:
   case NVPTX::BI__nvvm_ldu_ull:
   case NVPTX::BI__nvvm_ldu_ull2:
-    return MakeLdgLdu(Intrinsic::nvvm_ldu_global_i, *this, E);
+    return MakeLdu(Intrinsic::nvvm_ldu_global_i, *this, E);
   case NVPTX::BI__nvvm_ldu_f:
   case NVPTX::BI__nvvm_ldu_f2:
   case NVPTX::BI__nvvm_ldu_f4:
   case NVPTX::BI__nvvm_ldu_d:
   case NVPTX::BI__nvvm_ldu_d2:
-    return MakeLdgLdu(Intrinsic::nvvm_ldu_global_f, *this, E);
+    return MakeLdu(Intrinsic::nvvm_ldu_global_f, *this, E);
 
   case NVPTX::BI__nvvm_atom_cta_add_gen_i:
   case NVPTX::BI__nvvm_atom_cta_add_gen_l:
@@ -21188,14 +21204,11 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
     return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16x2, BuiltinID, E,
                         *this);
   case NVPTX::BI__nvvm_ldg_h:
-    return MakeHalfType(Intrinsic::nvvm_ldg_global_f, BuiltinID, E, *this);
   case NVPTX::BI__nvvm_ldg_h2:
-    return MakeHalfType(Intrinsic::nvvm_ldg_global_f, BuiltinID, E, *this);
+    return MakeHalfType(Intrinsic::not_intrinsic, BuiltinID, E, *this);
   case NVPTX::BI__nvvm_ldu_h:
+  case NVPTX::BI__nvvm_ldu_h2:
     return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
-  case NVPTX::BI__nvvm_ldu_h2: {
-    return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
-  }
   case NVPTX::BI__nvvm_cp_async_ca_shared_global_4:
     return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4,
                        Intrinsic::nvvm_cp_async_ca_shared_global_4_s, *this, E,
diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c b/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c
index b594fc876d4b9e..035c4c6066be24 100644
--- a/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c
+++ b/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c
@@ -52,8 +52,8 @@ typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));
 // CHECK: call <2 x half> @llvm.nvvm.fmax.ftz.xorsign.abs.f16x2(<2 x half> {{.*}}, <2 x half> {{.*}})
 // CHECK: call <2 x half> @llvm.nvvm.fmax.nan.xorsign.abs.f16x2(<2 x half> {{.*}}, <2 x half> {{.*}})
 // CHECK: call <2 x half> @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f16x2(<2 x half> {{.*}}, <2 x half> {{.*}})
-// CHECK: call half @llvm.nvvm.ldg.global.f.f16.p0(ptr {{.*}}, i32 2)
-// CHECK: call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr {{.*}}, i32 4)
+// CHECK: load half, ptr addrspace(1) {{.*}}, align 2, !invariant.load
+// CHECK: load <2 x half>, ptr addrspace(1) {{.*}}, align 4, !invariant.load
 // CHECK: call half @llvm.nvvm.ldu.global.f.f16.p0(ptr {{.*}}, i32 2)
 // CHECK: call <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p0(ptr {{.*}}, i32 4)
 __device__ void nvvm_native_half_types(void *a, void*b, void*c, __fp16* out) {
diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type.c b/clang/test/CodeGen/builtins-nvptx-native-half-type.c
index 4aeae953bc1622..511497702ff7f9 100644
--- a/clang/test/CodeGen/builtins-nvptx-native-half-type.c
+++ b/clang/test/CodeGen/builtins-nvptx-native-half-type.c
@@ -177,9 +177,9 @@ typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));
 
 // CHECK-LABEL: nvvm_ldg_native_half_types
 __device__ void nvvm_ldg_native_half_types(const void *p) {
-  // CHECK: call half @llvm.nvvm.ldg.global.f.f16.p0
+  // CHECK: load half, ptr addrspace(1) {{.*}}, align 2, !invariant.load
   __nvvm_ldg_h((const __fp16 *)p);
-  // CHECK: call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0
+  // CHECK: load <2 x half>, ptr addrspace(1) {{.*}}, align 4, !invariant.load
   __nvvm_ldg_h2((const __fp16v2 *)p);
 }
 
diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c
index 0d0e3ecdb90c9e..3406cbdde2bf88 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -598,33 +598,33 @@ __device__ void nvvm_atom(float *fp, float f, double *dfp, double df,
 
 // CHECK-LABEL: nvvm_ldg
 __device__ void nvvm_ldg(const void *p) {
-  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
-  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
-  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
+  // CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
+  // CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
+  // CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
   __nvvm_ldg_c((const char *)p);
   __nvvm_ldg_uc((const unsigned char *)p);
   __nvvm_ldg_sc((const signed char *)p);
 
-  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
-  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
+  // CHECK: load i16, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
+  // CHECK: load i16, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
   __nvvm_ldg_s((const short *)p);
   __nvvm_ldg_us((const unsigned short *)p);
 
-  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
-  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
+  // CHECK: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
+  // CHECK: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
   __nvvm_ldg_i((const int *)p);
   __nvvm_ldg_ui((const unsigned int *)p);
 
-  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
-  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
-  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
-  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
+  // LP32: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
+  // LP32: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
+  // LP64: load i64, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
+  // LP64: load i64, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
   __nvvm_ldg_l((const long *)p);
   __nvvm_ldg_ul((const unsigned long *)p);
 
-  // CHECK: call float @llvm.nvvm.ldg.global.f.f32.p0(ptr {{%[0-9]+}}, i32 4)
+  // CHECK: load float, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
   __nvvm_ldg_f((const float *)p);
-  // CHECK: call double @llvm.nvvm.ldg.global.f.f64.p0(ptr {{%[0-9]+}}, i32 8)
+  // CHECK: load double, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
   __nvvm_ldg_d((const double *)p);
 
   // In practice, the pointers we pass to __ldg will be aligned as appropriate
@@ -636,9 +636,9 @@ __device__ void nvvm_ldg(const void *p) {
   // elements, its alignment is set to number of elements times the alignment of
   // its member: n*alignof(t)."
-  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
-  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
-  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
+  // CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
+  // CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
+  // CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
   typedef char char2 __attribute__((ext_vector_type(2)));
   typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
   typedef signed char schar2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_c2((const char2 *)p);
   __nvvm_ldg_uc2((const uchar2 *)p);
   __nvvm_ldg_sc2((const schar2 *)p);
@@ -646,9 +646,9 @@ __device__ void nvvm_ldg(const void *p) {
-  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
-  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
-  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
+  // CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
+  // CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
+  // CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
   typedef char char4 __attribute__((ext_vector_type(4)));
   typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
   typedef signed char schar4 __attribute__((ext_vector_type(4)));
   __nvvm_ldg_c4((const char4 *)p);
   __nvvm_ldg_uc4((const uchar4 *)p);
   __nvvm_ldg_sc4((const schar4 *)p);
@@ -656,59 +656,59 @@ __device__ void nvvm_ldg(const void *p) {
-  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
-  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
+  // CHECK: load <2 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
+  // CHECK: load <2 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
   typedef short short2 __attribute__((ext_vector_type(2)));
   typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_s2((const short2 *)p);
   __nvvm_ldg_us2((const ushort2 *)p);
 
-  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
-  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
+  // CHECK: load <4 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
+  // CHECK: load <4 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
   typedef short short4 __attribute__((ext_vector_type(4)));
   typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
   __nvvm_ldg_s4((const short4 *)p);
   __nvvm_ldg_us4((const ushort4 *)p);
 
-  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
-  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
+  // CHECK: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
+  // CHECK: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
   typedef int int2 __attribute__((ext_vector_type(2)));
   typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_i2((const int2 *)p);
   __nvvm_ldg_ui2((const uint2 *)p);
 
-  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
-  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
+  // CHECK: load <4 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
+  // CHECK: load <4 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
   typedef int int4 __attribute__((ext_vector_type(4)));
   typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
   __nvvm_ldg_i4((const int4 *)p);
   __nvvm_ldg_ui4((const uint4 *)p);
 
-  // LP32: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
-  // LP32: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
-  // LP64: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
-  // LP64: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
+  // LP32: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
+  // LP32: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
+  // LP64: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
+  // LP64: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
   typedef long long2 __attribute__((ext_vector_type(2)));
   typedef unsigned long ulong2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_l2((const long2 *)p);
   __nvvm_ldg_ul2((const ulong2 *)p);
 
-  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
-  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
+  // CHECK: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
+  // CHECK: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
   typedef long long longlong2 __attribute__((ext_vector_type(2)));
   typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_ll2((const longlong2 *)p);
   __nvvm_ldg_ull2((const ulonglong2 *)p);
 
-  // CHECK: call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0(ptr {{%[0-9]+}}, i32 8)
+  // CHECK: load <2 x float>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
   typedef float float2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_f2((const float2 *)p);
 
-  // CHECK: call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0(ptr {{%[0-9]+}}, i32 16)
+  // CHECK: load <4 x float>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
   typedef float float4 __attribute__((ext_vector_type(4)));
   __nvvm_ldg_f4((const float4 *)p);
 
-  // CHECK: call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr {{%[0-9]+}}, i32 16)
+  // CHECK: load <2 x double>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
   typedef double double2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_d2((const double2 *)p);
 }
 
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 7b8ffe417fccdb..3cc45adb198e26 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -42,6 +42,9 @@
 // * llvm.nvvm.ptr.shared.to.gen --> ibid.
 // * llvm.nvvm.ptr.constant.to.gen --> ibid.
 // * llvm.nvvm.ptr.local.to.gen --> ibid.
+// * llvm.nvvm.ldg.global.i --> load addrspace(1) !invariant.load
+// * llvm.nvvm.ldg.global.f --> ibid.
+// * llvm.nvvm.ldg.global.p --> ibid.
 
 def llvm_global_ptr_ty  : LLVMQualPointerType<1>;  // (global)ptr
 def llvm_shared_ptr_ty  : LLVMQualPointerType<3>;  // (shared)ptr
@@ -1595,21 +1598,6 @@ def int_nvvm_ldu_global_p : Intrinsic<[llvm_anyptr_ty],
   [llvm_anyptr_ty, llvm_i32_ty],
   [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldu.global.p">;
 
-// Generated within nvvm. Use for ldg on sm_35 or later. Second arg is the
-// pointer's alignment.
-def int_nvvm_ldg_global_i : Intrinsic<[llvm_anyint_ty],
-  [llvm_anyptr_ty, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
-  "llvm.nvvm.ldg.global.i">;
-def int_nvvm_ldg_global_f : Intrinsic<[llvm_anyfloat_ty],
-  [llvm_anyptr_ty, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
-  "llvm.nvvm.ldg.global.f">;
-def int_nvvm_ldg_global_p : Intrinsic<[llvm_anyptr_ty],
-  [llvm_anyptr_ty, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
-  "llvm.nvvm.ldg.global.p">;
-
 // Used in nvvm internally to help address space opt and ptx code generation
 // This is for params that are passed to kernel functions by pointer by-val.
 def int_nvvm_ptr_gen_to_param: Intrinsic<[llvm_anyptr_ty],
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index bb03c9290e4cf4..73882fbc7a251a 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -37,6 +37,7 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/CommandLine.h"
@@ -1301,6 +1302,10 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
                (Name.consume_front("local") || Name.consume_front("shared") ||
                 Name.consume_front("global") || Name.consume_front("constant")) &&
                Name.starts_with(".to.gen");
+    else if (Name.consume_front("ldg.global."))
+      // nvvm.ldg.global.{i,p,f}
+      Expand = (Name.starts_with("i.") || Name.starts_with("f.") ||
+                Name.starts_with("p."));
     else
       Expand = false;
 
@@ -2363,6 +2368,15 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
               Name.consume_front("constant")) &&
              Name.starts_with(".to.gen"))) {
     Rep = Builder.CreateAddrSpaceCast(CI->getArgOperand(0), CI->getType());
+  } else if (Name.consume_front("ldg.global")) {
+    Value *Ptr = CI->getArgOperand(0);
+    Align PtrAlign = cast<ConstantInt>(CI->getArgOperand(1))->getAlignValue();
+    // Use addrspace(1) for NVPTX ADDRESS_SPACE_GLOBAL
+    Value *ASC = Builder.CreateAddrSpaceCast(Ptr, Builder.getPtrTy(1));
+    Instruction *LD = Builder.CreateAlignedLoad(CI->getType(), ASC, PtrAlign);
+    MDNode *MD = MDNode::get(Builder.getContext(), {});
+    LD->setMetadata(LLVMContext::MD_invariant_load, MD);
+    return LD;
   } else {
     Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name);
     if (IID != Intrinsic::not_intrinsic &&
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 93c2d92ef7c1c8..965ed98630a28d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -126,8 +126,6 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
     if (tryLoadVector(N))
       return;
     break;
-  case NVPTXISD::LDGV2:
-  case NVPTXISD::LDGV4:
   case NVPTXISD::LDUV2:
   case NVPTXISD::LDUV4:
     if (tryLDGLDU(N))
@@ -550,9 +548,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
   switch (IID) {
   default:
     return false;
-  case Intrinsic::nvvm_ldg_global_f:
-  case Intrinsic::nvvm_ldg_global_i:
-  case Intrinsic::nvvm_ldg_global_p:
   case Intrinsic::nvvm_ldu_global_f:
   case Intrinsic::nvvm_ldu_global_i:
   case Intrinsic::nvvm_ldu_global_p:
@@ -1559,34 +1554,11 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
 }
 
 bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
-  SDValue Op1;
-  MemSDNode *Mem;
-  bool IsLDG = true;
+  auto *Mem = cast<MemSDNode>(N);
 
   // If this is an LDG intrinsic, the address is the third operand. If its an
   // LDG/LDU SD node (from custom vector handling), then its the second operand
-  if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
-    Op1 = N->getOperand(2);
-    Mem = cast<MemIntrinsicSDNode>(N);
-    unsigned IID = N->getConstantOperandVal(1);
-    switch (IID) {
-    default:
-      return false;
-    case Intrinsic::nvvm_ldg_global_f:
-    case Intrinsic::nvvm_ldg_global_i:
-    case Intrinsic::nvvm_ldg_global_p:
-      IsLDG = true;
-      break;
-    case Intrinsic::nvvm_ldu_global_f:
-    case Intrinsic::nvvm_ldu_global_i:
-    case Intrinsic::nvvm_ldu_global_p:
-      IsLDG = false;
-      break;
-    }
-  } else {
-    Op1 = N->getOperand(1);
-    Mem = cast<MemSDNode>(N);
-  }
+  SDValue Op1 = N->getOperand(N->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 2 : 1);
 
   EVT OrigType = N->getValueType(0);
   EVT EltVT = Mem->getMemoryVT();
@@ -1629,26 +1601,20 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
     default:
       return false;
     case ISD::LOAD:
+      Opcode = pickOpcodeForVT(
+          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
+          NVPTX::INT_PTX_LDG_GLOBAL_i16avar, NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
+          NVPTX::INT_PTX_LDG_GLOBAL_i64avar, NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
+          NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
+      break;
     case ISD::INTRINSIC_W_CHAIN:
-      if (IsLDG)
-        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
-      else
-        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
+      Opcode = pickOpcodeForVT(
+          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
+          NVPTX::INT_PTX_LDU_GLOBAL_i16avar, NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
+          NVPTX::INT_PTX_LDU_GLOBAL_i64avar, NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
+          NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
       break;
     case NVPTXISD::LoadV2:
-    case NVPTXISD::LDGV2:
       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
                                NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
@@ -1667,7 +1633,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
                                NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
       break;
     case NVPTXISD::LoadV4:
-    case NVPTXISD::LDGV4:
       Opcode = pickOpcodeForVT(
           EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
          NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
@@ -1693,26 +1658,24 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
     default:
      return false;
    case ISD::LOAD:
+      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                               NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
+                               NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
+                               NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
+                               NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
+                               NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
+                               NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
+      break;
     case ISD::INTRINSIC_W_CHAIN:
-      if (IsLDG)
-        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
-      else
-        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
+      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                               NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
+                               NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
+                               NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
+                               NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
+                               NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
+                               NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
       break;
     case NVPTXISD::LoadV2:
-    case NVPTXISD::LDGV2:
       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
                                NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
@@ -1731,7 +1694,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
                                NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
       break;
     case NVPTXISD::LoadV4:
-    case NVPTXISD::LDGV4:
       Opcode = pickOpcodeForVT(
           EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
           NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
@@ -1751,26 +1713,20 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
     default:
       return false;
     case ISD::LOAD:
+      Opcode = pickOpcodeForVT(
+          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
+          NVPTX::INT_PTX_LDG_GLOBAL_i16ari, NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
+          NVPTX::INT_PTX_LDG_GLOBAL_i64ari, NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
+          NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
+      break;
     case ISD::INTRINSIC_W_CHAIN:
-      if (IsLDG)
-        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
-      else
-        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
+      Opcode = pickOpcodeForVT(
+          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
+          NVPTX::INT_PTX_LDU_GLOBAL_i16ari, NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
+          NVPTX::INT_PTX_LDU_GLOBAL_i64ari, NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
+          NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
       break;
     case NVPTXISD::LoadV2:
-    case NVPTXISD::LDGV2:
       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
                                NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
@@ -1789,7 +1745,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
                                NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
       break;
     case NVPTXISD::LoadV4:
-    case NVPTXISD::LDGV4:
       Opcode = pickOpcodeForVT(
           EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
           NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
@@ -1815,26 +1770,24 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
     default:
       return false;
     case ISD::LOAD:
+      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                               NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
+                               NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
+                               NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
+                               NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
+                               NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
+                               NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
+      break;
     case ISD::INTRINSIC_W_CHAIN:
-      if (IsLDG)
-        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
-      else
-        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
+      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                               NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
+                               NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
+                               NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
+                               NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
+                               NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
+                               NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
       break;
     case NVPTXISD::LoadV2:
-    case NVPTXISD::LDGV2:
       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
                                NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
@@ -1853,7 +1806,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
                                NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
       break;
     case NVPTXISD::LoadV4:
-    case NVPTXISD::LDGV4:
       Opcode = pickOpcodeForVT(
           EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
           NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
@@ -1873,26 +1825,24 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
     default:
       return false;
     case ISD::LOAD:
+      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                               NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
+                               NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
+                               NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
+                               NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
+                               NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
+                               NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
+      break;
     case ISD::INTRINSIC_W_CHAIN:
-      if (IsLDG)
-        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
-                                 NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
-      else
-        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
-                                 NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
+      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                               NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
+                               NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
+                               NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
+                               NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
+                               NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
+                               NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
       break;
     case NVPTXISD::LoadV2:
-    case NVPTXISD::LDGV2:
       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
                                NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
@@ -1911,7 +1861,6 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
                                NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
       break;
     case NVPTXISD::LoadV4:
-    case NVPTXISD::LDGV4:
       Opcode = pickOpcodeForVT(
          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
          NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 57bc5fe0ac361c..a95cba586b8fc3 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -949,8 +949,6 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(NVPTXISD::ProxyReg)
     MAKE_CASE(NVPTXISD::LoadV2)
     MAKE_CASE(NVPTXISD::LoadV4)
-    MAKE_CASE(NVPTXISD::LDGV2)
-    MAKE_CASE(NVPTXISD::LDGV4)
     MAKE_CASE(NVPTXISD::LDUV2)
     MAKE_CASE(NVPTXISD::LDUV4)
     MAKE_CASE(NVPTXISD::StoreV2)
@@ -4774,26 +4772,6 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
     return true;
   }
 
-  case Intrinsic::nvvm_ldg_global_i:
-  case Intrinsic::nvvm_ldg_global_f:
-  case Intrinsic::nvvm_ldg_global_p: {
-    auto &DL = I.getDataLayout();
-
-    Info.opc = ISD::INTRINSIC_W_CHAIN;
-    if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
-      Info.memVT = getValueType(DL, I.getType());
-    else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)
-      Info.memVT = getPointerTy(DL);
-    else
-      Info.memVT = getValueType(DL, I.getType());
-    Info.ptrVal = I.getArgOperand(0);
-    Info.offset = 0;
-    Info.flags = MachineMemOperand::MOLoad;
-    Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
-
-    return true;
-  }
-
   case Intrinsic::nvvm_tex_1d_v4f32_s32:
   case Intrinsic::nvvm_tex_1d_v4f32_f32:
   case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
@@ -6308,9 +6286,6 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
   switch (IntrinNo) {
   default:
     return;
-  case Intrinsic::nvvm_ldg_global_i:
-  case Intrinsic::nvvm_ldg_global_f:
-  case Intrinsic::nvvm_ldg_global_p:
   case Intrinsic::nvvm_ldu_global_i:
   case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_p: {
@@ -6339,37 +6314,11 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
     default:
       return;
     case 2:
-      switch (IntrinNo) {
-      default:
-        return;
-      case Intrinsic::nvvm_ldg_global_i:
-      case Intrinsic::nvvm_ldg_global_f:
-      case Intrinsic::nvvm_ldg_global_p:
-        Opcode = NVPTXISD::LDGV2;
-        break;
-      case Intrinsic::nvvm_ldu_global_i:
-      case Intrinsic::nvvm_ldu_global_f:
-      case Intrinsic::nvvm_ldu_global_p:
-        Opcode = NVPTXISD::LDUV2;
-        break;
-      }
+      Opcode = NVPTXISD::LDUV2;
       LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
       break;
     case 4: {
-      switch (IntrinNo) {
-      default:
-        return;
-      case Intrinsic::nvvm_ldg_global_i:
-      case Intrinsic::nvvm_ldg_global_f:
-      case Intrinsic::nvvm_ldg_global_p:
-        Opcode = NVPTXISD::LDGV4;
-        break;
-      case Intrinsic::nvvm_ldu_global_i:
-      case Intrinsic::nvvm_ldu_global_f:
-      case Intrinsic::nvvm_ldu_global_p:
-        Opcode = NVPTXISD::LDUV4;
-        break;
-      }
+      Opcode = NVPTXISD::LDUV4;
       EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
       LdResVTs = DAG.getVTList(ListVTs);
       break;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 8c3a597ce0b085..824a659671967a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -70,8 +70,6 @@ enum NodeType : unsigned {
   LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE,
   LoadV4,
-  LDGV2, // LDG.v2
-  LDGV4, // LDG.v4
   LDUV2, // LDU.v2
   LDUV4, // LDU.v4
   StoreV2,
diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
index 584c0ef7cfeb78..5cc3a30277459b 100644
--- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
+++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
@@ -44,6 +44,13 @@ declare ptr @llvm.nvvm.ptr.shared.to.gen.p0.p3(ptr addrspace(3))
 declare ptr @llvm.nvvm.ptr.constant.to.gen.p0.p4(ptr addrspace(4))
 declare ptr @llvm.nvvm.ptr.local.to.gen.p0.p5(ptr addrspace(5))
 
+declare i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1), i32)
+declare ptr @llvm.nvvm.ldg.global.p.p1(ptr addrspace(1), i32)
+declare float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1), i32)
+declare i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr, i32)
+declare ptr @llvm.nvvm.ldg.global.p.p0(ptr, i32)
+declare float @llvm.nvvm.ldg.global.f.f32.p0(ptr, i32)
+
 ; CHECK-LABEL: @simple_upgrade
 define void @simple_upgrade(i32 %a, i64 %b, i16 %c) {
   ; CHECK: call i32 @llvm.bitreverse.i32(i32 %a)
@@ -191,3 +198,27 @@ define void @addrspacecast(ptr %p0) {
 
   ret void
 }
+
+; CHECK-LABEL: @ldg
+define void @ldg(ptr %p0, ptr addrspace(1) %p1) {
+; CHECK: %1 = load i32, ptr addrspace(1) %p1, align 4, !invariant.load !0
+; CHECK: %2 = load ptr, ptr addrspace(1) %p1, align 8, !invariant.load !0
+; CHECK: %3 = load float, ptr addrspace(1) %p1, align 16, !invariant.load !0
+
+; CHECK: %4 = addrspacecast ptr %p0 to ptr addrspace(1)
+; CHECK: %5 = load i32, ptr addrspace(1) %4, align 4, !invariant.load !0
+; CHECK: %6 = addrspacecast ptr %p0 to ptr addrspace(1)
+; CHECK: %7 = load ptr, ptr addrspace(1) %6, align 8, !invariant.load !0
+; CHECK: %8 = addrspacecast ptr %p0 to ptr addrspace(1)
+; CHECK: %9 = load float, ptr addrspace(1) %8, align 16, !invariant.load !0
+;
+  %v1 = call i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %p1, i32 4)
+  %v2 = call ptr @llvm.nvvm.ldg.global.p.p1(ptr addrspace(1) %p1, i32 8)
+  %v3 = call float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %p1, i32 16)
+
+  %v4 = call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr %p0, i32 4)
+  %v5 = call ptr @llvm.nvvm.ldg.global.p.p0(ptr %p0, i32 8)
+  %v6 = call float @llvm.nvvm.ldg.global.f.f32.p0(ptr %p0, i32 16)
+
+  ret void
+}
\ No newline at end of file

>From 07b4ef65f764e5520f82fc0611b36ece6661c9f6 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amacl...@nvidia.com>
Date: Sat, 19 Oct 2024 19:32:41 +0000
Subject: [PATCH 2/2] address formatting and comments

---
 clang/lib/CodeGen/CGBuiltin.cpp | 2 +-
 llvm/docs/ReleaseNotes.md       | 9 ++++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 40a875ab29c900..e05333a4c32803 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -20486,7 +20486,7 @@ static NVPTXMmaInfo getNVPTXMmaInfo(unsigned BuiltinID) {
 }
 
 static Value *MakeLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
-                         const CallExpr *E) {
+                      const CallExpr *E) {
   Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
   QualType ArgType = E->getArg(0)->getType();
   clang::CharUnits Align = CGF.CGM.getNaturalPointeeTypeAlignment(ArgType);
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index e5853789c78b63..417a085e14f06c 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -93,7 +93,14 @@ Changes to the LLVM IR
   * `llvm.nvvm.ptr.shared.to.gen`
   * `llvm.nvvm.ptr.constant.to.gen`
   * `llvm.nvvm.ptr.local.to.gen`
-
+
+* Remove the following intrinsics which can be replaced with a load from
+  addrspace(1) with `!invariant.load` metadata:
+
+  * `llvm.nvvm.ldg.global.i`
+  * `llvm.nvvm.ldg.global.f`
+  * `llvm.nvvm.ldg.global.p`
+
 * Operand bundle values can now be metadata strings.
 
 Changes to LLVM infrastructure
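
For IR that still calls the removed intrinsics, the auto-upgrader rewrites each
call into a plain load from addrspace(1) tagged with empty !invariant.load
metadata, inserting an addrspacecast first when the pointer is generic. A
minimal before/after sketch of the mapping, following the
auto_upgrade_nvvm_intrinsics.ll test above (value names are illustrative):

  ; Before: the removed intrinsic form; the i32 argument is the alignment.
  %v = call float @llvm.nvvm.ldg.global.f.f32.p0(ptr %p, i32 4)

  ; After upgrade: cast to the global address space, then an invariant load.
  %g = addrspacecast ptr %p to ptr addrspace(1)
  %v = load float, ptr addrspace(1) %g, align 4, !invariant.load !0

  ; where !0 is an empty metadata node:
  !0 = !{}

Frontends should emit the load form directly, as the CGBuiltin.cpp changes
above now do for the __nvvm_ldg_* builtins.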