================ @@ -0,0 +1,184 @@ +//===-- nvptxintrin.h - NVPTX intrinsic functions -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __NVPTXINTRIN_H +#define __NVPTXINTRIN_H + +#ifndef __NVPTX__ +#error "This file is intended for NVPTX targets or offloading to NVPTX +#endif + +#include <stdbool.h> +#include <stdint.h> + +#if defined(__HIP__) || defined(__CUDA__) +#define _DEFAULT_ATTRS __attribute__((device)) __attribute__((always_inline)) +#else +#define _DEFAULT_ATTRS __attribute__((always_inline)) +#endif + +#pragma omp begin declare target device_type(nohost) +#pragma omp begin declare variant match(device = {arch(nvptx64)}) + +// Type aliases to the address spaces used by the NVPTX backend. +#define _private __attribute__((opencl_private)) +#define _constant __attribute__((opencl_constant)) +#define _local __attribute__((opencl_local)) +#define _global __attribute__((opencl_global)) + +// Attribute to declare a function as a kernel. +#define _kernel __attribute__((nvptx_kernel)) + +// Returns the number of CUDA blocks in the 'x' dimension. +_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_x() { + return __nvvm_read_ptx_sreg_nctaid_x(); +} + +// Returns the number of CUDA blocks in the 'y' dimension. +_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_y() { + return __nvvm_read_ptx_sreg_nctaid_y(); +} + +// Returns the number of CUDA blocks in the 'z' dimension. +_DEFAULT_ATTRS static inline uint32_t _get_num_blocks_z() { + return __nvvm_read_ptx_sreg_nctaid_z(); +} + +// Returns the total number of CUDA blocks. 
+_DEFAULT_ATTRS static inline uint64_t _get_num_blocks() { + return _get_num_blocks_x() * _get_num_blocks_y() * _get_num_blocks_z(); +} + +// Returns the 'x' dimension of the current CUDA block's id. +_DEFAULT_ATTRS static inline uint32_t _get_block_id_x() { + return __nvvm_read_ptx_sreg_ctaid_x(); +} + +// Returns the 'y' dimension of the current CUDA block's id. +_DEFAULT_ATTRS static inline uint32_t _get_block_id_y() { + return __nvvm_read_ptx_sreg_ctaid_y(); +} + +// Returns the 'z' dimension of the current CUDA block's id. +_DEFAULT_ATTRS static inline uint32_t _get_block_id_z() { + return __nvvm_read_ptx_sreg_ctaid_z(); +} + +// Returns the absolute id of the CUDA block. +_DEFAULT_ATTRS static inline uint64_t _get_block_id() { + return _get_block_id_x() + _get_num_blocks_x() * _get_block_id_y() + + _get_num_blocks_x() * _get_num_blocks_y() * _get_block_id_z(); +} + +// Returns the number of CUDA threads in the 'x' dimension. +_DEFAULT_ATTRS static inline uint32_t _get_num_threads_x() { + return __nvvm_read_ptx_sreg_ntid_x(); +} + +// Returns the number of CUDA threads in the 'y' dimension. +_DEFAULT_ATTRS static inline uint32_t _get_num_threads_y() { + return __nvvm_read_ptx_sreg_ntid_y(); +} + +// Returns the number of CUDA threads in the 'z' dimension. +_DEFAULT_ATTRS static inline uint32_t _get_num_threads_z() { + return __nvvm_read_ptx_sreg_ntid_z(); +} + +// Returns the total number of threads in the block. +_DEFAULT_ATTRS static inline uint64_t _get_num_threads() { + return _get_num_threads_x() * _get_num_threads_y() * _get_num_threads_z(); +} + +// Returns the 'x' dimension id of the thread in the current CUDA block. +_DEFAULT_ATTRS static inline uint32_t _get_thread_id_x() { + return __nvvm_read_ptx_sreg_tid_x(); +} + +// Returns the 'y' dimension id of the thread in the current CUDA block. 
+_DEFAULT_ATTRS static inline uint32_t _get_thread_id_y() { + return __nvvm_read_ptx_sreg_tid_y(); +} + +// Returns the 'z' dimension id of the thread in the current CUDA block. +_DEFAULT_ATTRS static inline uint32_t _get_thread_id_z() { + return __nvvm_read_ptx_sreg_tid_z(); +} + +// Returns the absolute id of the thread in the current CUDA block. +_DEFAULT_ATTRS static inline uint64_t _get_thread_id() { + return _get_thread_id_x() + _get_num_threads_x() * _get_thread_id_y() + + _get_num_threads_x() * _get_num_threads_y() * _get_thread_id_z(); +} + +// Returns the size of a CUDA warp, always 32 on NVIDIA hardware. +_DEFAULT_ATTRS static inline uint32_t _get_lane_size() { return 32; } ---------------- jdoerfert wrote:
Can we use the macro/builtin for this?

https://github.com/llvm/llvm-project/pull/110179
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits