llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-libc Author: Joseph Huber (jhuber6) <details> <summary>Changes</summary> Summary: These helpers are very useful but currently absent. They allow the user to get a bitmask representing the matches within the warp. I have made an executive decision to drop the `predicate` return from `match_all` because it's easily testable with `match_all() == __activemask()`. --- Full diff: https://github.com/llvm/llvm-project/pull/127504.diff 5 Files Affected: - (modified) clang/lib/Headers/amdgpuintrin.h (+56) - (modified) clang/lib/Headers/nvptxintrin.h (+74) - (modified) libc/src/__support/GPU/utils.h (+8) - (modified) libc/test/integration/src/__support/GPU/CMakeLists.txt (+9) - (added) libc/test/integration/src/__support/GPU/match.cpp (+32) ``````````diff diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h index 9dad99ffe9439..355e75d0b2d42 100644 --- a/clang/lib/Headers/amdgpuintrin.h +++ b/clang/lib/Headers/amdgpuintrin.h @@ -162,6 +162,62 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x, ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width)); } +// Returns a bitmask marking all lanes that have the same value of __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) { + uint32_t __match_mask = 0; + + bool __done = 0; + while (__gpu_ballot(__lane_mask, !__done)) { + if (!__done) { + uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x); + if (__first == __x) { + __match_mask = __gpu_lane_mask(); + __done = 1; + } + } + } + __gpu_sync_lane(__lane_mask); + return __match_mask; +} + +// Returns a bitmask marking all lanes that have the same value of __x. 
+_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) { + uint64_t __match_mask = 0; + + bool __done = 0; + while (__gpu_ballot(__lane_mask, !__done)) { + if (!__done) { + uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x); + if (__first == __x) { + __match_mask = __gpu_lane_mask(); + __done = 1; + } + } + } + __gpu_sync_lane(__lane_mask); + return __match_mask; +} + +// Returns the current lane mask if every lane contains __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) { + uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x); + uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first); + __gpu_sync_lane(__lane_mask); + return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull; +} + +// Returns the current lane mask if every lane contains __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) { + uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x); + uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first); + __gpu_sync_lane(__lane_mask); + return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull; +} + // Returns true if the flat pointer points to AMDGPU 'shared' memory. 
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) { return __builtin_amdgcn_is_shared((void [[clang::address_space(0)]] *)(( diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h index 40fa2edebe975..f857a87b5f4c7 100644 --- a/clang/lib/Headers/nvptxintrin.h +++ b/clang/lib/Headers/nvptxintrin.h @@ -13,6 +13,10 @@ #error "This file is intended for NVPTX targets or offloading to NVPTX" #endif +#ifndef __CUDA_ARCH__ +#define __CUDA_ARCH__ 0 +#endif + #include <stdint.h> #if !defined(__cplusplus) @@ -168,6 +172,76 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x, ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width)); } +// Returns a bitmask marking all lanes that have the same value of __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) { + // Newer targets can use the dedicated CUDA support. + if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700) + return __nvvm_match_any_sync_i32(__lane_mask, __x); + + uint32_t __match_mask = 0; + bool __done = 0; + while (__gpu_ballot(__lane_mask, !__done)) { + if (!__done) { + uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x); + if (__first == __x) { + __match_mask = __gpu_lane_mask(); + __done = 1; + } + } + } + return __match_mask; +} + +// Returns a bitmask marking all lanes that have the same value of __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) { + // Newer targets can use the dedicated CUDA support. 
+ if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700) + return __nvvm_match_any_sync_i64(__lane_mask, __x); + + uint64_t __match_mask = 0; + + bool __done = 0; + while (__gpu_ballot(__lane_mask, !__done)) { + if (!__done) { + uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x); + if (__first == __x) { + __match_mask = __gpu_lane_mask(); + __done = 1; + } + } + } + __gpu_sync_lane(__lane_mask); + return __match_mask; +} + +// Returns the current lane mask if every lane contains __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) { + // Newer targets can use the dedicated CUDA support. + int predicate; + if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700) + return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate); + + uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x); + uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first); + return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull; +} + +// Returns the current lane mask if every lane contains __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) { + // Newer targets can use the dedicated CUDA support. + int predicate; + if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700) + return __nvvm_match_all_sync_i64p(__lane_mask, __x, &predicate); + + uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x); + uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first); + return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull; +} + // Returns true if the flat pointer points to CUDA 'shared' memory. 
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) { return __nvvm_isspacep_shared(ptr); diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h index 323c003f1ff07..0fd3a6498b865 100644 --- a/libc/src/__support/GPU/utils.h +++ b/libc/src/__support/GPU/utils.h @@ -92,6 +92,14 @@ LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x, return __gpu_shuffle_idx_u32(lane_mask, idx, x, width); } +LIBC_INLINE uint64_t match_any(uint64_t lane_mask, uint32_t x) { + return __gpu_match_any_u32(lane_mask, x); +} + +LIBC_INLINE uint64_t match_all(uint64_t lane_mask, uint32_t x) { + return __gpu_match_all_u32(lane_mask, x); +} + [[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); } LIBC_INLINE bool is_first_lane(uint64_t lane_mask) { diff --git a/libc/test/integration/src/__support/GPU/CMakeLists.txt b/libc/test/integration/src/__support/GPU/CMakeLists.txt index 68bbc3849bc7e..e066830f6cc0d 100644 --- a/libc/test/integration/src/__support/GPU/CMakeLists.txt +++ b/libc/test/integration/src/__support/GPU/CMakeLists.txt @@ -18,3 +18,12 @@ add_integration_test( LOADER_ARGS --threads 64 ) + +add_integration_test( + match_test + SUITE libc-support-gpu-tests + SRCS + match.cpp + LOADER_ARGS + --threads 64 +) diff --git a/libc/test/integration/src/__support/GPU/match.cpp b/libc/test/integration/src/__support/GPU/match.cpp new file mode 100644 index 0000000000000..225078022cdc3 --- /dev/null +++ b/libc/test/integration/src/__support/GPU/match.cpp @@ -0,0 +1,32 @@ +//===-- Test for the match operations on the GPU --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/bit.h" +#include "src/__support/GPU/utils.h" +#include "test/IntegrationTest/test.h" + +using namespace LIBC_NAMESPACE; + +// Test to ensure that match any / match all work. +static void test_match() { + uint64_t mask = gpu::get_lane_mask(); + EXPECT_EQ(1ull << gpu::get_lane_id(), gpu::match_any(mask, gpu::get_lane_id())); + EXPECT_EQ(mask, gpu::match_any(mask, 1)); + uint64_t expected = gpu::get_lane_id() < 16 ? 0xffff : 0xffff0000; + EXPECT_EQ(expected, gpu::match_any(mask, gpu::get_lane_id() < 16)); + EXPECT_EQ(mask, gpu::match_all(mask, 1)); + EXPECT_EQ(0ull, gpu::match_all(mask, gpu::get_lane_id())); +} + +TEST_MAIN(int argc, char **argv, char **envp) { + if (gpu::get_thread_id() >= gpu::get_lane_size()) + return 0; + + test_match(); + + return 0; +} `````````` </details> https://github.com/llvm/llvm-project/pull/127504 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits