https://github.com/durga4github updated https://github.com/llvm/llvm-project/pull/124961
>From 467c3a41badb66b9187864a040c9eeccef1b583c Mon Sep 17 00:00:00 2001 From: Durgadoss R <durgado...@nvidia.com> Date: Wed, 29 Jan 2025 16:31:06 +0530 Subject: [PATCH] [NVPTX] Add tcgen05 alloc/dealloc intrinsics This patch adds intrinsics for the tcgen05 alloc/dealloc family of PTX instructions. This patch also adds addrspace 6 for tensor memory which is used by these intrinsics. lit tests are added and verified with a ptxas-12.8 executable. Documentation for these additions is also added in NVPTXUsage.rst. Signed-off-by: Durgadoss R <durgado...@nvidia.com> --- clang/lib/Basic/Targets/NVPTX.cpp | 9 +- clang/test/CodeGen/target-data.c | 4 +- llvm/docs/NVPTXUsage.rst | 98 ++++++++++++++ llvm/include/llvm/IR/IntrinsicsNVVM.td | 30 +++++ llvm/include/llvm/Support/NVPTXAddrSpace.h | 1 + llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 1 + llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 41 ++++++ llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 15 +++ llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 3 + llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll | 131 +++++++++++++++++++ 10 files changed, 327 insertions(+), 6 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index a03f4983b9d0384..017146a9ada14a3 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -62,12 +62,13 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple, HasFloat16 = true; if (TargetPointerWidth == 32) - resetDataLayout("e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"); - else if (Opts.NVPTXUseShortPointers) resetDataLayout( - "e-p3:32:32-p4:32:32-p5:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"); + "e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"); + else if (Opts.NVPTXUseShortPointers) + resetDataLayout("e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-i64:64-i128:128-v16:" + "16-v32:32-n16:32:64"); else - resetDataLayout("e-i64:64-i128:128-v16:16-v32:32-n16:32:64"); + 
resetDataLayout("e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"); // If possible, get a TargetInfo for our host triple, so we can match its // types. diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c index 71eb849433ed40d..fe29aadb1dd532f 100644 --- a/clang/test/CodeGen/target-data.c +++ b/clang/test/CodeGen/target-data.c @@ -160,11 +160,11 @@ // RUN: %clang_cc1 -triple nvptx-unknown -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=NVPTX -// NVPTX: target datalayout = "e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" +// NVPTX: target datalayout = "e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" // RUN: %clang_cc1 -triple nvptx64-unknown -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=NVPTX64 -// NVPTX64: target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +// NVPTX64: target datalayout = "e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" // RUN: %clang_cc1 -triple r600-unknown -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=R600 diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 64dd2b84a1763e7..dec6ad4e541152a 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -962,6 +962,104 @@ The ``griddepcontrol`` intrinsics allows the dependent grids and prerequisite gr For more information, refer `PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-griddepcontrol>`__. +TCGEN05 family of Intrinsics +---------------------------- + +The llvm.nvvm.tcgen05.* intrinsics model the TCGEN05 family of instructions +exposed by PTX. These intrinsics use 'Tensor Memory' (henceforth ``tmem``). +NVPTX represents this memory using ``addrspace(6)`` and is always 32-bits. + +For more information, refer to the PTX ISA +`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-memory>`_. + +The tensor-memory pointers may only be used with the tcgen05 intrinsics. 
+There are specialized load/store instructions provided (tcgen05.ld/st) to +work with tensor-memory. + +See the PTX ISA for more information on tensor-memory load/store instructions +`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-memory-and-register-load-store-instructions>`_. + +'``llvm.nvvm.tcgen05.alloc``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.tcgen05.alloc.cg1(ptr %dst, i32 %ncols) + declare void @llvm.nvvm.tcgen05.alloc.cg2(ptr %dst, i32 %ncols) + declare void @llvm.nvvm.tcgen05.alloc.shared.cg1(ptr addrspace(3) %dst, i32 %ncols) + declare void @llvm.nvvm.tcgen05.alloc.shared.cg2(ptr addrspace(3) %dst, i32 %ncols) + +Overview: +""""""""" + +The '``@llvm.nvvm.tcgen05.alloc.*``' intrinsics correspond to the +``tcgen05.alloc.cta_group*.sync.aligned.b32`` family of PTX instructions. +The ``tcgen05.alloc`` is a potentially blocking instruction which dynamically +allocates the specified number of columns in the Tensor Memory and writes +the address of the allocated Tensor Memory into shared memory at the +location specified by ``%dst``. The 32-bit operand ``%ncols`` specifies +the number of columns to be allocated and it must be a power-of-two. +The ``.shared`` variant explicitly uses shared memory address space for +the ``%dst`` operand. The ``.cg1`` and ``.cg2`` variants generate +``cta_group::1`` and ``cta_group::2`` variants of the instruction respectively. + +For more information, refer to the PTX ISA +`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-memory-allocation-and-management-instructions>`_. + +'``llvm.nvvm.tcgen05.dealloc``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. 
code-block:: llvm + + declare void @llvm.nvvm.tcgen05.dealloc.cg1(ptr addrspace(6) %tmem_addr, i32 %ncols) + declare void @llvm.nvvm.tcgen05.dealloc.cg2(ptr addrspace(6) %tmem_addr, i32 %ncols) + +Overview: +""""""""" + +The '``@llvm.nvvm.tcgen05.dealloc.*``' intrinsics correspond to the +``tcgen05.dealloc.*`` set of PTX instructions. The ``tcgen05.dealloc`` +instruction deallocates the Tensor Memory specified by the Tensor Memory +address ``%tmem_addr``. The operand ``%tmem_addr`` must point to a previous +Tensor Memory allocation. The 32-bit operand ``%ncols`` specifies the number +of columns to be de-allocated. The ``.cg1`` and ``.cg2`` variants generate +``cta_group::1`` and ``cta_group::2`` variants of the instruction respectively. + +For more information, refer to the PTX ISA +`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-memory-allocation-and-management-instructions>`_. + +'``llvm.nvvm.tcgen05.relinq.alloc.permit``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg1() + declare void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg2() + +Overview: +""""""""" + +The '``@llvm.nvvm.tcgen05.relinq.alloc.permit.*``' intrinsics correspond +to the ``tcgen05.relinquish_alloc_permit.*`` set of PTX instructions. +This instruction specifies that the CTA of the executing thread is +relinquishing the right to allocate Tensor Memory. So, it is illegal +for a CTA to perform ``tcgen05.alloc`` after any of its constituent +threads execute ``tcgen05.relinquish_alloc_permit``. The ``.cg1`` +and ``.cg2`` variants generate ``cta_group::1`` and ``cta_group::2`` +flavors of the instruction respectively. + +For more information, refer to the PTX ISA +`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-memory-allocation-and-management-instructions>`_. 
+ Other Intrinsics ---------------- diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 9a2f38d760e6591..abbe25bf0040a6f 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -48,6 +48,7 @@ def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr +def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr // // MISC @@ -5055,4 +5056,33 @@ def int_nvvm_cp_async_bulk_prefetch_L2 def int_nvvm_griddepcontrol_launch_dependents: Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; def int_nvvm_griddepcontrol_wait: Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; +// +// Tcgen05 family of Intrinsics +// + +// Tcgen05 alloc/dealloc related intrinsics + +foreach cta_group = ["cg1", "cg2"] in { + def int_nvvm_tcgen05_alloc_ # cta_group : Intrinsic<[], + [llvm_ptr_ty, // dst_ptr + llvm_i32_ty] , // num_columns + [IntrConvergent, IntrInaccessibleMemOrArgMemOnly, + WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>; + + def int_nvvm_tcgen05_alloc_shared_ # cta_group : Intrinsic<[], + [llvm_shared_ptr_ty, // dst_ptr + llvm_i32_ty], // num_columns + [IntrConvergent, IntrInaccessibleMemOrArgMemOnly, + WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>; + + def int_nvvm_tcgen05_dealloc_ # cta_group : Intrinsic<[], + [llvm_tmem_ptr_ty, // tmem_addr + llvm_i32_ty], // num_columns + [IntrConvergent, IntrArgMemOnly, + NoCapture<ArgIndex<0>>]>; + + def int_nvvm_tcgen05_relinq_alloc_permit_ # cta_group : Intrinsic<[], [], + [IntrConvergent, IntrInaccessibleMemOnly]>; +} + } // let TargetPrefix = "nvvm" diff --git a/llvm/include/llvm/Support/NVPTXAddrSpace.h b/llvm/include/llvm/Support/NVPTXAddrSpace.h index 93eae39e3d23050..b111dc9a240e41a 100644 --- a/llvm/include/llvm/Support/NVPTXAddrSpace.h +++ b/llvm/include/llvm/Support/NVPTXAddrSpace.h @@ -23,6 +23,7 @@ enum AddressSpace : unsigned { ADDRESS_SPACE_SHARED = 3, 
ADDRESS_SPACE_CONST = 4, ADDRESS_SPACE_LOCAL = 5, + ADDRESS_SPACE_TENSOR = 6, ADDRESS_SPACE_PARAM = 101, }; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 633a99d0fc1be35..74def43d825665b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -163,6 +163,7 @@ def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">; def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">; def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">; +def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">; def True : Predicate<"true">; def False : Predicate<"false">; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 56d8b734bf01df8..a0d00e4aac560a5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -7582,3 +7582,44 @@ def GRIDDEPCONTROL_WAIT : Requires<[hasSM<90>, hasPTX<78>]>; def INT_EXIT : NVPTXInst<(outs), (ins), "exit;", [(int_nvvm_exit)]>; + +// Tcgen05 intrinsics +let isConvergent = true in { + +multiclass TCGEN05_ALLOC_INTR<NVPTXRegClass rc, string AS, string num, Intrinsic Intr> { + def NAME : NVPTXInst<(outs), + (ins rc:$dst, Int32Regs:$ncols), + !strconcat("tcgen05.alloc.cta_group::", num, ".sync.aligned", AS, ".b32 [$dst], $ncols;"), + [(Intr rc:$dst, Int32Regs:$ncols)]>, + Requires<[hasTcgen05Instructions]>; +} + +defm TCGEN05_ALLOC_CG1 : TCGEN05_ALLOC_INTR<Int64Regs, "", "1", int_nvvm_tcgen05_alloc_cg1>; +defm TCGEN05_ALLOC_CG2 : TCGEN05_ALLOC_INTR<Int64Regs, "", "2", int_nvvm_tcgen05_alloc_cg2>; + +defm TCGEN05_ALLOC_S64_CG1 : TCGEN05_ALLOC_INTR<Int64Regs, ".shared::cta", "1", int_nvvm_tcgen05_alloc_shared_cg1>; +defm TCGEN05_ALLOC_S64_CG2 : TCGEN05_ALLOC_INTR<Int64Regs, ".shared::cta", "2", int_nvvm_tcgen05_alloc_shared_cg2>; + +defm TCGEN05_ALLOC_S32_CG1 : TCGEN05_ALLOC_INTR<Int32Regs, 
".shared::cta", "1", int_nvvm_tcgen05_alloc_shared_cg1>; +defm TCGEN05_ALLOC_S32_CG2 : TCGEN05_ALLOC_INTR<Int32Regs, ".shared::cta", "2", int_nvvm_tcgen05_alloc_shared_cg2>; + +multiclass TCGEN05_DEALLOC_INTR<string num, Intrinsic Intr> { + def NAME : NVPTXInst<(outs), + (ins Int32Regs:$tmem_addr, Int32Regs:$ncols), + !strconcat("tcgen05.dealloc.cta_group::", num, ".sync.aligned.b32 $tmem_addr, $ncols;"), + [(Intr Int32Regs:$tmem_addr, Int32Regs:$ncols)]>, + Requires<[hasTcgen05Instructions]>; +} +defm TCGEN05_DEALLOC_CG1: TCGEN05_DEALLOC_INTR<"1", int_nvvm_tcgen05_dealloc_cg1>; +defm TCGEN05_DEALLOC_CG2: TCGEN05_DEALLOC_INTR<"2", int_nvvm_tcgen05_dealloc_cg2>; + +multiclass TCGEN05_RELINQ_PERMIT_INTR<string num, Intrinsic Intr> { + def NAME : NVPTXInst<(outs), (ins), + !strconcat("tcgen05.relinquish_alloc_permit.cta_group::", num, ".sync.aligned;"), + [(Intr)]>, + Requires<[hasTcgen05Instructions]>; +} +defm TCGEN05_RELINQ_CG1: TCGEN05_RELINQ_PERMIT_INTR<"1", int_nvvm_tcgen05_relinq_alloc_permit_cg1>; +defm TCGEN05_RELINQ_CG2: TCGEN05_RELINQ_PERMIT_INTR<"2", int_nvvm_tcgen05_relinq_alloc_permit_cg2>; + +} // isConvergent diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 919f487c7014162..0c4420b085dc9a0 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -93,6 +93,21 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { bool hasDotInstructions() const { return SmVersion >= 61 && PTXVersion >= 50; } + // Tcgen05 instructions in Blackwell family + bool hasTcgen05Instructions() const { + bool HasTcgen05 = false; + switch (FullSmVersion) { + default: + break; + case 1001: // sm_100a + case 1011: // sm_101a + HasTcgen05 = true; + break; + } + + return HasTcgen05 && PTXVersion >= 86; + } + // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction // terminates a basic block. Instead, it would assume that control flow // continued to the next instruction. 
The next instruction could be in the diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index e88027f30a03cc3..f2afa6fc20bfa5e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -140,6 +140,9 @@ static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) { else if (UseShortPointers) Ret += "-p3:32:32-p4:32:32-p5:32:32"; + // Tensor Memory (addrspace:6) is always 32-bits. + Ret += "-p6:32:32"; + Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64"; return Ret; diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll new file mode 100644 index 000000000000000..f80b5a5e16ea3cb --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll @@ -0,0 +1,131 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK_PTX64 %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK_PTX64_SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %} + +declare void @llvm.nvvm.tcgen05.alloc.cg1(ptr %addr, i32 %ncols) +declare void @llvm.nvvm.tcgen05.alloc.cg2(ptr %addr, i32 %ncols) +declare void @llvm.nvvm.tcgen05.alloc.shared.cg1(ptr addrspace(3) %addr, i32 %ncols) +declare void @llvm.nvvm.tcgen05.alloc.shared.cg2(ptr addrspace(3) %addr, i32 %ncols) + +; CHECK-LABEL: test_tcgen05_alloc +define void @test_tcgen05_alloc(ptr %addr, i32 %ncols) { +; CHECK_PTX64-LABEL: test_tcgen05_alloc( +; CHECK_PTX64: { +; CHECK_PTX64-NEXT: .reg .b32 %r<2>; +; CHECK_PTX64-NEXT: .reg .b64 %rd<2>; +; CHECK_PTX64-EMPTY: +; CHECK_PTX64-NEXT: // %bb.0: +; 
CHECK_PTX64-NEXT: ld.param.u64 %rd1, [test_tcgen05_alloc_param_0]; +; CHECK_PTX64-NEXT: ld.param.u32 %r1, [test_tcgen05_alloc_param_1]; +; CHECK_PTX64-NEXT: tcgen05.alloc.cta_group::1.sync.aligned.b32 [%rd1], %r1; +; CHECK_PTX64-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.b32 [%rd1], %r1; +; CHECK_PTX64-NEXT: ret; +; +; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_alloc( +; CHECK_PTX64_SHARED32: { +; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<2>; +; CHECK_PTX64_SHARED32-NEXT: .reg .b64 %rd<2>; +; CHECK_PTX64_SHARED32-EMPTY: +; CHECK_PTX64_SHARED32-NEXT: // %bb.0: +; CHECK_PTX64_SHARED32-NEXT: ld.param.u64 %rd1, [test_tcgen05_alloc_param_0]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.u32 %r1, [test_tcgen05_alloc_param_1]; +; CHECK_PTX64_SHARED32-NEXT: tcgen05.alloc.cta_group::1.sync.aligned.b32 [%rd1], %r1; +; CHECK_PTX64_SHARED32-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.b32 [%rd1], %r1; +; CHECK_PTX64_SHARED32-NEXT: ret; + call void @llvm.nvvm.tcgen05.alloc.cg1(ptr %addr, i32 %ncols) + call void @llvm.nvvm.tcgen05.alloc.cg2(ptr %addr, i32 %ncols) + + ret void +} + +; CHECK-LABEL: test_tcgen05_alloc_shared +define void @test_tcgen05_alloc_shared(ptr addrspace(3) %addr, i32 %ncols) { +; CHECK_PTX64-LABEL: test_tcgen05_alloc_shared( +; CHECK_PTX64: { +; CHECK_PTX64-NEXT: .reg .b32 %r<2>; +; CHECK_PTX64-NEXT: .reg .b64 %rd<2>; +; CHECK_PTX64-EMPTY: +; CHECK_PTX64-NEXT: // %bb.0: +; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [test_tcgen05_alloc_shared_param_0]; +; CHECK_PTX64-NEXT: ld.param.u32 %r1, [test_tcgen05_alloc_shared_param_1]; +; CHECK_PTX64-NEXT: tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [%rd1], %r1; +; CHECK_PTX64-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [%rd1], %r1; +; CHECK_PTX64-NEXT: ret; +; +; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_alloc_shared( +; CHECK_PTX64_SHARED32: { +; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<3>; +; CHECK_PTX64_SHARED32-EMPTY: +; CHECK_PTX64_SHARED32-NEXT: // %bb.0: +; CHECK_PTX64_SHARED32-NEXT: 
ld.param.u32 %r1, [test_tcgen05_alloc_shared_param_0]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.u32 %r2, [test_tcgen05_alloc_shared_param_1]; +; CHECK_PTX64_SHARED32-NEXT: tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [%r1], %r2; +; CHECK_PTX64_SHARED32-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [%r1], %r2; +; CHECK_PTX64_SHARED32-NEXT: ret; + call void @llvm.nvvm.tcgen05.alloc.shared.cg1(ptr addrspace(3) %addr, i32 %ncols) + + call void @llvm.nvvm.tcgen05.alloc.shared.cg2(ptr addrspace(3) %addr, i32 %ncols) + ret void +} + +declare void @llvm.nvvm.tcgen05.dealloc.cg1(ptr addrspace(6) %tmem_addr, i32 %ncols) +declare void @llvm.nvvm.tcgen05.dealloc.cg2(ptr addrspace(6) %tmem_addr, i32 %ncols) + +; CHECK-LABEL: test_tcgen05_dealloc +define void @test_tcgen05_dealloc(ptr addrspace(6) %tmem_addr, i32 %ncols) { +; CHECK_PTX64-LABEL: test_tcgen05_dealloc( +; CHECK_PTX64: { +; CHECK_PTX64-NEXT: .reg .b32 %r<3>; +; CHECK_PTX64-EMPTY: +; CHECK_PTX64-NEXT: // %bb.0: +; CHECK_PTX64-NEXT: ld.param.u32 %r1, [test_tcgen05_dealloc_param_0]; +; CHECK_PTX64-NEXT: ld.param.u32 %r2, [test_tcgen05_dealloc_param_1]; +; CHECK_PTX64-NEXT: tcgen05.dealloc.cta_group::1.sync.aligned.b32 %r1, %r2; +; CHECK_PTX64-NEXT: tcgen05.dealloc.cta_group::2.sync.aligned.b32 %r1, %r2; +; CHECK_PTX64-NEXT: ret; +; +; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_dealloc( +; CHECK_PTX64_SHARED32: { +; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<3>; +; CHECK_PTX64_SHARED32-EMPTY: +; CHECK_PTX64_SHARED32-NEXT: // %bb.0: +; CHECK_PTX64_SHARED32-NEXT: ld.param.u32 %r1, [test_tcgen05_dealloc_param_0]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.u32 %r2, [test_tcgen05_dealloc_param_1]; +; CHECK_PTX64_SHARED32-NEXT: tcgen05.dealloc.cta_group::1.sync.aligned.b32 %r1, %r2; +; CHECK_PTX64_SHARED32-NEXT: tcgen05.dealloc.cta_group::2.sync.aligned.b32 %r1, %r2; +; CHECK_PTX64_SHARED32-NEXT: ret; + call void @llvm.nvvm.tcgen05.dealloc.cg1(ptr addrspace(6) %tmem_addr, i32 %ncols) + + call void 
@llvm.nvvm.tcgen05.dealloc.cg2(ptr addrspace(6) %tmem_addr, i32 %ncols) + ret void +} + +declare void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg1() +declare void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg2() + +; CHECK-LABEL: test_tcgen05_relinquish_alloc_permit +define void @test_tcgen05_relinquish_alloc_permit() { +; CHECK_PTX64-LABEL: test_tcgen05_relinquish_alloc_permit( +; CHECK_PTX64: { +; CHECK_PTX64-EMPTY: +; CHECK_PTX64-EMPTY: +; CHECK_PTX64-NEXT: // %bb.0: +; CHECK_PTX64-NEXT: tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned; +; CHECK_PTX64-NEXT: tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned; +; CHECK_PTX64-NEXT: ret; +; +; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_relinquish_alloc_permit( +; CHECK_PTX64_SHARED32: { +; CHECK_PTX64_SHARED32-EMPTY: +; CHECK_PTX64_SHARED32-EMPTY: +; CHECK_PTX64_SHARED32-NEXT: // %bb.0: +; CHECK_PTX64_SHARED32-NEXT: tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned; +; CHECK_PTX64_SHARED32-NEXT: tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned; +; CHECK_PTX64_SHARED32-NEXT: ret; + call void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg1() + + call void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg2() + ret void +} _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits