https://github.com/nhaehnle updated https://github.com/llvm/llvm-project/pull/155724
From cbebbccc6c0638e396ab0c03106aafaf531ec66f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haeh...@amd.com> Date: Wed, 27 Aug 2025 16:08:20 -0700 Subject: [PATCH] clang/AMDGPU: Add __builtin_amdgcn_inverse_ballot_w{32,64} Add builtins that expose the underlying llvm.amdgcn.inverse.ballot intrinsic that we've had for a while. This allows more explicitly writing code that selects or branches in terms of lane masks, which can lead to better code quality. --- clang/docs/LanguageExtensions.rst | 17 +++++++++++++++++ clang/include/clang/Basic/BuiltinsAMDGPU.def | 3 +++ clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 7 +++++++ .../CodeGenOpenCL/builtins-amdgcn-wave32.cl | 7 +++++++ .../CodeGenOpenCL/builtins-amdgcn-wave64.cl | 7 +++++++ 5 files changed, 41 insertions(+) diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index a13e0a5952fe4..2ce60de05fff2 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -5162,6 +5162,23 @@ If no address spaces names are provided, all address spaces are fenced. __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local") __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local", "global") +__builtin_amdgcn_ballot_w{32,64} +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``__builtin_amdgcn_ballot_w{32,64}`` returns a bitmask that contains its +boolean argument as a bit for every lane of the current wave that is currently +active (i.e., that is converged with the executing thread), and a 0 bit for +every lane that is not active. + +The result is uniform, i.e. it is the same in every active thread of the wave. + +__builtin_amdgcn_inverse_ballot_w{32,64} +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Given a wave-uniform bitmask, ``__builtin_amdgcn_inverse_ballot_w{32,64}(mask)`` +returns the bit at the position of the current lane. 
It is almost equivalent to +``((mask >> lane_id) & 1) != 0``, except that its behavior is only defined if +the given mask has the same value for all active lanes of the current wave. ARM/AArch64 Language Extensions ------------------------------- diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index f8f55772db8fe..6f5d1e024b91d 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -183,6 +183,9 @@ TARGET_BUILTIN(__builtin_amdgcn_struct_ptr_buffer_load_lds, "vQbv*3IUiiiiIiIi", TARGET_BUILTIN(__builtin_amdgcn_ballot_w32, "ZUib", "nc", "wavefrontsize32") BUILTIN(__builtin_amdgcn_ballot_w64, "WUib", "nc") +TARGET_BUILTIN(__builtin_amdgcn_inverse_ballot_w32, "bZUi", "nc", "wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_inverse_ballot_w64, "bWUi", "nc", "wavefrontsize64") + // Deprecated intrinsics in favor of __builtin_amdgn_ballot_{w32|w64} BUILTIN(__builtin_amdgcn_uicmp, "WUiUiUiIi", "nc") BUILTIN(__builtin_amdgcn_uicmpl, "WUiWUiWUiIi", "nc") diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index dad1f95ac710d..433d76b2812db 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -504,6 +504,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType }); return Builder.CreateCall(F, { Src }); } + case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w32: + case AMDGPU::BI__builtin_amdgcn_inverse_ballot_w64: { + llvm::Value *Src = EmitScalarExpr(E->getArg(0)); + Function *F = + CGM.getIntrinsic(Intrinsic::amdgcn_inverse_ballot, {Src->getType()}); + return Builder.CreateCall(F, {Src}); + } case AMDGPU::BI__builtin_amdgcn_tanhf: case AMDGPU::BI__builtin_amdgcn_tanhh: case AMDGPU::BI__builtin_amdgcn_tanh_bf16: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl
b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl index 5e587cb87e073..d390418523694 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl @@ -24,6 +24,13 @@ void test_ballot_wave32_target_attr(global uint* out, int a, int b) *out = __builtin_amdgcn_ballot_w32(a == b); } +// CHECK-LABEL: @test_inverse_ballot_wave32( +// CHECK: call i1 @llvm.amdgcn.inverse.ballot.i32(i32 %{{.+}}) +void test_inverse_ballot_wave32(global bool* out, int a) +{ + *out = __builtin_amdgcn_inverse_ballot_w32(a); +} + // CHECK-LABEL: @test_read_exec( // CHECK: call i32 @llvm.amdgcn.ballot.i32(i1 true) void test_read_exec(global uint* out) { diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl index 1fc2ac0d3141e..d851ec7e6734f 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl @@ -23,6 +23,13 @@ void test_ballot_wave64_target_attr(global ulong* out, int a, int b) *out = __builtin_amdgcn_ballot_w64(a == b); } +// CHECK-LABEL: @test_inverse_ballot_wave64( +// CHECK: call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %{{.+}}) +void test_inverse_ballot_wave64(global bool* out, ulong a) +{ + *out = __builtin_amdgcn_inverse_ballot_w64(a); +} + // CHECK-LABEL: @test_read_exec( // CHECK: call i64 @llvm.amdgcn.ballot.i64(i1 true) void test_read_exec(global ulong* out) { _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits