https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/79768
>From 2c7049defef3b62de7017640948cccfb07ff756c Mon Sep 17 00:00:00 2001 From: Joseph Huber <hube...@outlook.com> Date: Sun, 28 Jan 2024 14:57:05 -0600 Subject: [PATCH 1/3] [NVPTX] Add 'activemask' builtin and intrinsic support Summary: This patch adds support for getting the 'activemask' instruction's value without needing to use inline assembly. See the relevant PTX reference for details. https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask --- clang/include/clang/Basic/BuiltinsNVPTX.def | 8 ++++- clang/test/CodeGen/builtins-nvptx.c | 16 ++++++--- llvm/include/llvm/IR/IntrinsicsNVVM.td | 8 +++++ llvm/lib/Target/NVPTX/NVPTX.td | 4 +-- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 6 ++++ llvm/test/CodeGen/NVPTX/activemask.ll | 38 +++++++++++++++++++++ 6 files changed, 73 insertions(+), 7 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/activemask.ll diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 0f2e8260143be78..506288547a15822 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -44,6 +44,7 @@ #pragma push_macro("PTX42") #pragma push_macro("PTX60") #pragma push_macro("PTX61") +#pragma push_macro("PTX62") #pragma push_macro("PTX63") #pragma push_macro("PTX64") #pragma push_macro("PTX65") @@ -76,7 +77,8 @@ #define PTX65 "ptx65|" PTX70 #define PTX64 "ptx64|" PTX65 #define PTX63 "ptx63|" PTX64 -#define PTX61 "ptx61|" PTX63 +#define PTX62 "ptx62|" PTX63 +#define PTX61 "ptx61|" PTX62 #define PTX60 "ptx60|" PTX61 #define PTX42 "ptx42|" PTX60 @@ -632,6 +634,9 @@ TARGET_BUILTIN(__nvvm_vote_any_sync, "bUib", "", PTX60) TARGET_BUILTIN(__nvvm_vote_uni_sync, "bUib", "", PTX60) TARGET_BUILTIN(__nvvm_vote_ballot_sync, "UiUib", "", PTX60) +// Mask +TARGET_BUILTIN(__nvvm_activemask, "i", "n", PTX62) + // Match TARGET_BUILTIN(__nvvm_match_any_sync_i32, "UiUiUi", "", AND(SM_70,PTX60)) TARGET_BUILTIN(__nvvm_match_any_sync_i64, "UiUiWi", "", AND(SM_70,PTX60)) @@ -1065,6 +1070,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78)) #pragma pop_macro("PTX42") #pragma pop_macro("PTX60") #pragma pop_macro("PTX61") +#pragma pop_macro("PTX62") #pragma pop_macro("PTX63") #pragma pop_macro("PTX64") #pragma pop_macro("PTX65") diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index 353f3ebb608c2b1..a2e73eb1d268bd1 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -5,16 +5,16 @@ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \ // RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP64 %s -// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 \ +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \ // RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s -// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_60 \ +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \ // RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s -// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_61 \ +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_61 -target-feature +ptx62 \ // RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s -// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 \ +// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 -target-feature +ptx62 \ // RUN: -DERROR_CHECK -fcuda-is-device -S -o /dev/null -x cuda -verify %s // RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \ // RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \ @@ -165,6 +165,14 @@ __device__ void sync() { } +__device__ void activemask() { + +// CHECK: call i32 @llvm.nvvm.activemask() + + __nvvm_activemask(); + +} + // NVVM intrinsics diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 5a5ba2592e1467e..0640fb1f74aa5eb 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -4599,6 +4599,14 @@ def int_nvvm_vote_ballot_sync : [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.vote.ballot.sync">, ClangBuiltin<"__nvvm_vote_ballot_sync">; +// +// ACTIVEMASK +// +def int_nvvm_activemask : + Intrinsic<[llvm_i32_ty], [], + [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.activemask">, + ClangBuiltin<"__nvvm_activemask">; + // // MATCH.SYNC // diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td index f2a4ce381b40b48..a2233d3882b236d 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.td +++ b/llvm/lib/Target/NVPTX/NVPTX.td @@ -40,7 +40,7 @@ foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53, def SM90a: FeatureSM<"90a", 901>; -foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 63, 64, 65, +foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65, 70, 71, 72, 73, 74, 75, 76, 77, 78, 80, 81, 82, 83] in def PTX#version: FeaturePTX<version>; @@ -65,7 +65,7 @@ def : Proc<"sm_61", [SM61, PTX50]>; def : Proc<"sm_62", [SM62, PTX50]>; def : Proc<"sm_70", [SM70, PTX60]>; def : Proc<"sm_72", [SM72, PTX61]>; -def : Proc<"sm_75", [SM75, PTX63]>; +def : Proc<"sm_75", [SM75, PTX62, PTX63]>; def : Proc<"sm_80", [SM80, PTX70]>; def : Proc<"sm_86", [SM86, PTX71]>; def : Proc<"sm_87", [SM87, PTX74]>; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 33f1e4a43e072af..2df931597616566 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -263,6 +263,12 @@ multiclass MATCH_ANY_SYNC<NVPTXRegClass regclass, string ptxtype, Intrinsic IntO Requires<[hasPTX<60>, hasSM<70>]>; } +// activemask.b32 +def ACTIVEMASK : NVPTXInst<(outs Int32Regs:$dest), (ins), + "activemask.b32 \t$dest;", + [(set Int32Regs:$dest, (int_nvvm_activemask))]>, + Requires<[hasPTX<62>, hasSM<30>]>; + defm MATCH_ANY_SYNC_32 : MATCH_ANY_SYNC<Int32Regs, "b32", int_nvvm_match_any_sync_i32, i32imm>; defm MATCH_ANY_SYNC_64 : MATCH_ANY_SYNC<Int64Regs, "b64", int_nvvm_match_any_sync_i64, diff --git a/llvm/test/CodeGen/NVPTX/activemask.ll b/llvm/test/CodeGen/NVPTX/activemask.ll new file mode 100644 index 000000000000000..1496b2ebdd44270 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/activemask.ll @@ -0,0 +1,38 @@ +; RUN: llc < %s -march=nvptx64 -O2 -mcpu=sm_52 -mattr=+ptx62 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx62 | %ptxas-verify %} + +declare i32 @llvm.nvvm.activemask() + +; CHECK-LABEL: activemask( +; +; CHECK: activemask.b32 %[[REG:.+]]; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %[[REG]]; +; CHECK-NEXT: ret; +define dso_local i32 @activemask() { +entry: + %mask = call i32 @llvm.nvvm.activemask() + ret i32 %mask +} + +; CHECK-LABEL: convergent( +; +; CHECK: activemask.b32 %[[REG:.+]]; +; CHECK: activemask.b32 %[[REG]]; +; CHECK: .param.b32 [func_retval0+0], %[[REG]]; +; CHECK-NEXT: ret; +define dso_local i32 @convergent(i1 %cond) { +entry: + br i1 %cond, label %if.else, label %if.then + +if.then: + %0 = call i32 @llvm.nvvm.activemask() + br label %if.end + +if.else: + %1 = call i32 @llvm.nvvm.activemask() + br label %if.end + +if.end: + %mask = phi i32 [ %0, %if.then ], [ %1, %if.else ] + ret i32 %mask +} >From 04a1b8423549ece195941d6a92555fb104bf05e8 Mon Sep 17 00:00:00 2001 From: Joseph Huber <hube...@outlook.com> Date: Mon, 29 Jan 2024 13:10:54 -0600 Subject: [PATCH 2/3] AddHasSideEffects --- llvm/include/llvm/IR/IntrinsicsNVVM.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 0640fb1f74aa5eb..542bbf7f9234cb6 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -4604,7 +4604,7 @@ def int_nvvm_vote_ballot_sync : // def int_nvvm_activemask : Intrinsic<[llvm_i32_ty], [], - [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.activemask">, + [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback, IntrHasSideEffects], "llvm.nvvm.activemask">, ClangBuiltin<"__nvvm_activemask">; // >From b5f36a7c60613a4e40bfe406089f2a2396240063 Mon Sep 17 00:00:00 2001 From: Joseph Huber <hube...@outlook.com> Date: Mon, 29 Jan 2024 13:56:44 -0600 Subject: [PATCH 3/3] Update PTX version according to Artems instruction --- llvm/lib/Target/NVPTX/NVPTX.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td index a2233d3882b236d..6aa98543e5e22ea 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.td +++ b/llvm/lib/Target/NVPTX/NVPTX.td @@ -65,7 +65,7 @@ def : Proc<"sm_61", [SM61, PTX50]>; def : Proc<"sm_62", [SM62, PTX50]>; def : Proc<"sm_70", [SM70, PTX60]>; def : Proc<"sm_72", [SM72, PTX61]>; -def : Proc<"sm_75", [SM75, PTX62, PTX63]>; +def : Proc<"sm_75", [SM75, PTX63]>; def : Proc<"sm_80", [SM80, PTX70]>; def : Proc<"sm_86", [SM86, PTX71]>; def : Proc<"sm_87", [SM87, PTX74]>; _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits