[clang] clang/AMDGPU: Add __builtin_amdgcn_inverse_ballot_w{32,64} (PR #155724)

Nicolai Hähnle via cfe-commits Wed, 27 Aug 2025 16:54:33 -0700

https://github.com/nhaehnle updated 
https://github.com/llvm/llvm-project/pull/155724


From cbebbccc6c0638e396ab0c03106aafaf531ec66f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haeh...@amd.com>
Date: Wed, 27 Aug 2025 16:08:20 -0700
Subject: [PATCH] clang/AMDGPU: Add __builtin_amdgcn_inverse_ballot_w{32,64}

Add builtins that expose the underlying llvm.amdgcn.inverse.ballot
intrinsic that we've had for a while.

This allows more explicitly writing code that selects or branches in
terms of lane masks, which can lead to better code quality.
---
 clang/docs/LanguageExtensions.rst               | 17 +++++++++++++++++
 clang/include/clang/Basic/BuiltinsAMDGPU.def    |  3 +++
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp     |  7 +++++++
 .../CodeGenOpenCL/builtins-amdgcn-wave32.cl     |  7 +++++++
 .../CodeGenOpenCL/builtins-amdgcn-wave64.cl     |  7 +++++++
 5 files changed, 41 insertions(+)

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index a13e0a5952fe4..2ce60de05fff2 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -5162,6 +5162,23 @@ If no address spaces names are provided, all address spaces are fenced.
   __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local")
   __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local", "global")
 
+__builtin_amdgcn_ballot_w{32,64}
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``__builtin_amdgcn_ballot_w{32,64}`` returns a bitmask that contains its
+boolean argument as a bit for every lane of the current wave that is currently
+active (i.e., that is converged with the executing thread), and a 0 bit for
+every lane that is not active.
+
+The result is uniform, i.e. it is the same in every active thread of the wave.
+
+__builtin_amdgcn_inverse_ballot_w{32,64}
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Given a wave-uniform bitmask, ``__builtin_amdgcn_inverse_ballot_w{32,64}(mask)``
+returns the bit at the position of the current lane. It is almost equivalent to
+``(mask & (1ull << lane_id)) != 0``, except that its behavior is only defined if
+the given mask has the same value for all active lanes of the current wave.
 
 ARM/AArch64 Language Extensions
 -------------------------------
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index f8f55772db8fe..6f5d1e024b91d 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -183,6 +183,9 @@ TARGET_BUILTIN(__builtin_amdgcn_struct_ptr_buffer_load_lds, "vQbv*3IUiiiiIiIi",
 TARGET_BUILTIN(__builtin_amdgcn_ballot_w32, "ZUib", "nc", "wavefrontsize32")
 BUILTIN(__builtin_amdgcn_ballot_w64, "WUib", "nc")
 
+TARGET_BUILTIN(__builtin_amdgcn_inverse_ballot_w32, "bZUi", "nc", "wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_inverse_ballot_w64, "bWUi", "nc", "wavefrontsize64")
+
 // Deprecated intrinsics in favor of __builtin_amdgn_ballot_{w32|w64}
 BUILTIN(__builtin_amdgcn_uicmp, "WUiUiUiIi", "nc")
 BUILTIN(__builtin_amdgcn_uicmpl, "WUiWUiWUiIi", "nc")
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index dad1f95ac710d..433d76b2812db 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -504,6 +504,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType });
     return Builder.CreateCall(F, { Src });
   }
+  case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w32:
+  case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w64: {
+    llvm::Value *Src = EmitScalarExpr(E->getArg(0));
+    Function *F =
+        CGM.getIntrinsic(Intrinsic::amdgcn_inverse_ballot, {Src->getType()});
+    return Builder.CreateCall(F, {Src});
+  }
   case AMDGPU::BI__builtin_amdgcn_tanhf:
   case AMDGPU::BI__builtin_amdgcn_tanhh:
   case AMDGPU::BI__builtin_amdgcn_tanh_bf16:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
index 5e587cb87e073..d390418523694 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
@@ -24,6 +24,13 @@ void test_ballot_wave32_target_attr(global uint* out, int a, int b)
   *out = __builtin_amdgcn_ballot_w32(a == b);
 }
 
+// CHECK-LABEL: @test_inverse_ballot_wave32(
+// CHECK: call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %{{.+}})
+void test_inverse_ballot_wave32(global bool* out, int a)
+{
+  *out = __builtin_amdgcn_inverse_ballot_w32(a);
+}
+
 // CHECK-LABEL: @test_read_exec(
 // CHECK: call i32 @llvm.amdgcn.ballot.i32(i1 true)
 void test_read_exec(global uint* out) {
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
index 1fc2ac0d3141e..d851ec7e6734f 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl
@@ -23,6 +23,13 @@ void test_ballot_wave64_target_attr(global ulong* out, int a, int b)
   *out = __builtin_amdgcn_ballot_w64(a == b);
 }
 
+// CHECK-LABEL: @test_inverse_ballot_wave64(
+// CHECK: call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %{{.+}})
+void test_inverse_ballot_wave64(global bool* out, ulong a)
+{
+  *out = __builtin_amdgcn_inverse_ballot_w64(a);
+}
+
 // CHECK-LABEL: @test_read_exec(
 // CHECK: call i64 @llvm.amdgcn.ballot.i64(i1 true)
 void test_read_exec(global ulong* out) {

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] clang/AMDGPU: Add __builtin_amdgcn_inverse_ballot_w{32,64} (PR #155724)

Reply via email to