https://github.com/JonChesterfield updated https://github.com/llvm/llvm-project/pull/131190
>From b52a04c55ad56e1172dec6262f2536ec3fe7162b Mon Sep 17 00:00:00 2001 From: Jon Chesterfield <jonathanchesterfi...@gmail.com> Date: Wed, 12 Mar 2025 20:55:17 +0000 Subject: [PATCH] [SPIRV] GPU intrinsics --- clang/include/clang/Basic/Builtins.td | 29 + clang/lib/Headers/amdgpuintrin.h | 2 +- clang/lib/Headers/gpuintrin.h | 2 + clang/lib/Headers/spirvintrin.h | 182 +++++ clang/test/CodeGen/amdgpu-grid-builtins.c | 158 +++++ clang/test/CodeGen/gpu_builtins.c | 647 ++++++++++++++++++ clang/test/Headers/gpuintrin.c | 223 ++++++ llvm/include/llvm/IR/Intrinsics.td | 63 ++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 2 + llvm/include/llvm/InitializePasses.h | 1 + llvm/include/llvm/Transforms/Scalar.h | 6 + .../Transforms/Scalar/LowerGPUIntrinsic.h | 26 + llvm/lib/Passes/PassBuilder.cpp | 1 + llvm/lib/Passes/PassRegistry.def | 1 + llvm/lib/Transforms/Scalar/CMakeLists.txt | 1 + .../Transforms/Scalar/LowerGPUIntrinsic.cpp | 501 ++++++++++++++ llvm/lib/Transforms/Scalar/Scalar.cpp | 1 + llvm/test/CodeGen/SPIRV/gpu_intrinsics.ll | 427 ++++++++++++ 18 files changed, 2272 insertions(+), 1 deletion(-) create mode 100644 clang/lib/Headers/spirvintrin.h create mode 100644 clang/test/CodeGen/amdgpu-grid-builtins.c create mode 100644 clang/test/CodeGen/gpu_builtins.c create mode 100644 llvm/include/llvm/Transforms/Scalar/LowerGPUIntrinsic.h create mode 100644 llvm/lib/Transforms/Scalar/LowerGPUIntrinsic.cpp create mode 100644 llvm/test/CodeGen/SPIRV/gpu_intrinsics.ll diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 2fbdfaea57ccd..042508c1e59a8 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -4770,6 +4770,35 @@ def GetDeviceSideMangledName : LangBuiltin<"CUDA_LANG"> { let Prototype = "char const*(...)"; } +// GPU intrinsics +class GPUBuiltin<string prototype> : Builtin { + let Spellings = ["__builtin_" # NAME]; + let Prototype = prototype; + let Attributes = [NoThrow]; +} + +multiclass GPUGridBuiltin<string prototype> { + def _x : GPUBuiltin<prototype>; + def _y : GPUBuiltin<prototype>; + def _z : GPUBuiltin<prototype>; +} + +defm gpu_num_blocks : GPUGridBuiltin<"uint32_t()">; +defm gpu_block_id : GPUGridBuiltin<"uint32_t()">; +defm gpu_num_threads : GPUGridBuiltin<"uint32_t()">; +defm gpu_thread_id : GPUGridBuiltin<"uint32_t()">; + +def gpu_ballot : GPUBuiltin<"uint64_t(uint64_t, bool)">; +def gpu_exit : GPUBuiltin<"void()">; +def gpu_lane_id : GPUBuiltin<"uint32_t()">; +def gpu_lane_mask : GPUBuiltin<"uint64_t()">; +def gpu_num_lanes : GPUBuiltin<"uint32_t()">; +def gpu_read_first_lane_u32 : GPUBuiltin<"uint32_t(uint64_t, uint32_t)">; +def gpu_shuffle_idx_u32 : GPUBuiltin<"uint32_t(uint64_t, uint32_t, uint32_t, uint32_t)">; +def gpu_sync_lane : GPUBuiltin<"void(uint64_t)">; +def gpu_sync_threads : GPUBuiltin<"void()">; +def gpu_thread_suspend : GPUBuiltin<"void()">; + // HLSL def HLSLAddUint64: LangBuiltin<"HLSL_LANG"> { let Spellings = ["__builtin_hlsl_adduint64"]; diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h index f7fb8e2814180..817cfeec896c4 100644 --- a/clang/lib/Headers/amdgpuintrin.h +++ b/clang/lib/Headers/amdgpuintrin.h @@ -1,4 +1,4 @@ -//===-- amdgpuintrin.h - AMDPGU intrinsic functions -----------------------===// +//===-- amdgpuintrin.h - AMDGPU intrinsic functions -----------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h index 0fb3916acac61..a3ce535188a48 100644 --- a/clang/lib/Headers/gpuintrin.h +++ b/clang/lib/Headers/gpuintrin.h @@ -60,6 +60,8 @@ _Pragma("omp end declare target"); #include <nvptxintrin.h> #elif defined(__AMDGPU__) #include <amdgpuintrin.h> +#elif defined(__SPIRV64__) +#include <spirvintrin.h> #elif !defined(_OPENMP) #error "This header is only meant to be used on GPU architectures." #endif diff --git a/clang/lib/Headers/spirvintrin.h b/clang/lib/Headers/spirvintrin.h new file mode 100644 index 0000000000000..3f40ad156d5ce --- /dev/null +++ b/clang/lib/Headers/spirvintrin.h @@ -0,0 +1,182 @@ +//===-- spirvintrin.h - SPIRV intrinsic functions ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __SPIRVINTRIN_H +#define __SPIRVINTRIN_H + +#ifndef __SPIRV64__ +// 32-bit SPIRV is currently a stretch goal +#error "This file is intended for SPIRV64 targets or offloading to SPIRV64" +#endif + +#ifndef __GPUINTRIN_H +#error "Never use <spirvintrin.h> directly; include <gpuintrin.h> instead" +#endif + +// This is the skeleton of the SPIRV implementation of the gpuintrin interface. +// Address spaces and the kernel attribute are not yet implemented. + +#if defined(_OPENMP) +#error "OpenMP is not yet available on SPIRV through the gpuintrin header" +#endif + +// Type aliases to the address spaces used by the SPIRV backend. +#define __gpu_private +#define __gpu_constant +#define __gpu_local +#define __gpu_global +#define __gpu_generic + +// Attribute to declare a function as a kernel. +#define __gpu_kernel + +// Note: because the __builtin_gpu intrinsics lower to amdgcn or nvptx on +// request, the following implementations of these functions would work equally +// well in the amdgpuintrin.h or nvptxintrin.h headers, i.e. we could move this +// definition of __gpu_num_blocks_x et al. into gpuintrin.h and remove them +// from the three target intrin.h headers. + +// Returns the number of workgroups in the 'x' dimension of the grid. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) { + return __builtin_gpu_num_blocks_x(); +} + +// Returns the number of workgroups in the 'y' dimension of the grid. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) { + return __builtin_gpu_num_blocks_y(); +} + +// Returns the number of workgroups in the 'z' dimension of the grid. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) { + return __builtin_gpu_num_blocks_z(); +} + +// Returns the 'x' dimension of the current workgroup's id. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) { + return __builtin_gpu_block_id_x(); +} + +// Returns the 'y' dimension of the current workgroup's id. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) { + return __builtin_gpu_block_id_y(); +} + +// Returns the 'z' dimension of the current workgroup's id. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) { + return __builtin_gpu_block_id_z(); +} + +// Returns the number of workitems in the 'x' dimension. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) { + return __builtin_gpu_num_threads_x(); +} + +// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) { + return __builtin_gpu_num_threads_y(); +} + +// Returns the number of workitems in the 'z' dimension. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) { + return __builtin_gpu_num_threads_z(); +} + +// Returns the 'x' dimension id of the workitem in the current workgroup. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) { + return __builtin_gpu_thread_id_x(); +} + +// Returns the 'y' dimension id of the workitem in the current workgroup. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) { + return __builtin_gpu_thread_id_y(); +} + +// Returns the 'z' dimension id of the workitem in the current workgroup. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) { + return __builtin_gpu_thread_id_z(); +} + +// Returns the size of the wave. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) { + return __builtin_gpu_num_lanes(); +} + +// Returns the id of the thread inside of a wave executing together. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) { + return __builtin_gpu_lane_id(); +} + +// Returns the bit-mask of active threads in the current wave. +_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) { + return __builtin_gpu_lane_mask(); +} + +// Copies the value from the first active thread in the wave to the rest. +_DEFAULT_FN_ATTRS static __inline__ uint32_t +__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) { + return __builtin_gpu_read_first_lane_u32(__lane_mask, __x); +} + +// Returns a bitmask of threads in the current lane for which \p x is true. +_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask, + bool __x) { + return __builtin_gpu_ballot(__lane_mask, __x); +} + +// Waits for all the threads in the block to converge and issues a fence. +_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) { + return __builtin_gpu_sync_threads(); +} + +// Waits for all threads in the wave to converge. +_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) { + return __builtin_gpu_sync_lane(__lane_mask); +} + +// Shuffles the lanes inside the wave according to the given index. +_DEFAULT_FN_ATTRS static __inline__ uint32_t +__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x, + uint32_t __width) { + return __builtin_gpu_shuffle_idx_u32(__lane_mask, __idx, __x, __width); +} + +// Returns a bitmask marking all lanes that have the same value of __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) { + return __gpu_match_any_u32_impl(__lane_mask, __x); +} + +// Returns a bitmask marking all lanes that have the same value of __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) { + return __gpu_match_any_u64_impl(__lane_mask, __x); +} + +// Returns the current lane mask if every lane contains __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) { + return __gpu_match_all_u32_impl(__lane_mask, __x); +} + +// Returns the current lane mask if every lane contains __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) { + return __gpu_match_all_u64_impl(__lane_mask, __x); +} + +// Terminates execution of the associated wave.
+_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) { + return __builtin_gpu_exit(); +} + +// Suspend the thread briefly to assist the scheduler during busy loops. +_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) { + return __builtin_gpu_thread_suspend(); +} + +#endif // __SPIRVINTRIN_H diff --git a/clang/test/CodeGen/amdgpu-grid-builtins.c b/clang/test/CodeGen/amdgpu-grid-builtins.c new file mode 100644 index 0000000000000..2104da2dc3cbc --- /dev/null +++ b/clang/test/CodeGen/amdgpu-grid-builtins.c @@ -0,0 +1,158 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -emit-llvm -O1 %s -o - | FileCheck %s + +#include <stdint.h> + +// CHECK-LABEL: define dso_local noundef i32 @workgroup_id_x( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t workgroup_id_x(void) +{ + return __builtin_amdgcn_workgroup_id_x(); +} + +// CHECK-LABEL: define dso_local noundef i32 @workgroup_id_y( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.workgroup.id.y() +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t workgroup_id_y(void) +{ + return __builtin_amdgcn_workgroup_id_y(); +} + +// CHECK-LABEL: define dso_local noundef i32 @workgroup_id_z( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR3:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.workgroup.id.z() +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t workgroup_id_z(void) +{ + return __builtin_amdgcn_workgroup_id_z(); +} + +// CHECK-LABEL: define dso_local noundef range(i32 0, 1024) i32 @workitem_id_x( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR4:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t workitem_id_x(void) +{ + return __builtin_amdgcn_workitem_id_x(); +} + +// CHECK-LABEL: define dso_local noundef range(i32 0, 1024) i32 @workitem_id_y( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR5:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y() +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t workitem_id_y(void) +{ + return __builtin_amdgcn_workitem_id_y(); +} + +// CHECK-LABEL: define dso_local noundef range(i32 0, 1024) i32 @workitem_id_z( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR6:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z() +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t workitem_id_z(void) +{ + return __builtin_amdgcn_workitem_id_z(); +} + +// CHECK-LABEL: define dso_local range(i32 1, 1025) i32 @workgroup_size_x( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR7:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 12 +// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG3:![0-9]+]], !invariant.load [[META4:![0-9]+]], !noundef [[META4]] +// 
CHECK-NEXT: [[CONV:%.*]] = zext nneg i16 [[TMP2]] to i32 +// CHECK-NEXT: ret i32 [[CONV]] +// +uint32_t workgroup_size_x(void) +{ + return __builtin_amdgcn_workgroup_size_x(); +} + +// CHECK-LABEL: define dso_local range(i32 1, 1025) i32 @workgroup_size_y( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 14 +// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 2, !range [[RNG3]], !invariant.load [[META4]], !noundef [[META4]] +// CHECK-NEXT: [[CONV:%.*]] = zext nneg i16 [[TMP2]] to i32 +// CHECK-NEXT: ret i32 [[CONV]] +// +uint32_t workgroup_size_y(void) +{ + return __builtin_amdgcn_workgroup_size_y(); +} + +// CHECK-LABEL: define dso_local range(i32 1, 1025) i32 @workgroup_size_z( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 8, !range [[RNG3]], !invariant.load [[META4]], !noundef [[META4]] +// CHECK-NEXT: [[CONV:%.*]] = zext nneg i16 [[TMP2]] to i32 +// CHECK-NEXT: ret i32 [[CONV]] +// +uint32_t workgroup_size_z(void) +{ + return __builtin_amdgcn_workgroup_size_z(); +} + +// CHECK-LABEL: define dso_local range(i32 1, 0) i32 @grid_size_x( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR8:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 12 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG5:![0-9]+]], !invariant.load [[META4]] +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t grid_size_x(void) +{ + return __builtin_amdgcn_grid_size_x(); +} + +// CHECK-LABEL: define dso_local range(i32 1, 0) i32 @grid_size_y( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR8]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG5]], !invariant.load [[META4]] +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t grid_size_y(void) +{ + return __builtin_amdgcn_grid_size_y(); +} + +// CHECK-LABEL: define dso_local range(i32 1, 0) i32 @grid_size_z( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR8]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 20 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG5]], !invariant.load [[META4]] +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t grid_size_z(void) +{ + return __builtin_amdgcn_grid_size_z(); +} + +//. +// CHECK: [[RNG3]] = !{i16 1, i16 1025} +// CHECK: [[META4]] = !{} +// CHECK: [[RNG5]] = !{i32 1, i32 0} +//. 
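For context, a minimal usage sketch (an editorial illustration, not part of the patch): device code written against the portable gpuintrin.h interface compiles unchanged for spirv64 through the spirvintrin.h wrappers above, just as it already does for AMDGPU and NVPTX. The kernel name and scaling logic below are hypothetical; only the __gpu_* calls come from the header.

#include <gpuintrin.h>
#include <stdint.h>

// Hypothetical kernel: scale n elements, one element per work item.
__gpu_kernel void scale_u32(uint32_t *data, uint32_t n, uint32_t factor) {
  // Flat global index in the 'x' dimension.
  uint32_t gid =
      __gpu_num_threads_x() * __gpu_block_id_x() + __gpu_thread_id_x();
  if (gid < n)
    data[gid] *= factor;

  // Wave-level helpers come from the same header, e.g. broadcasting the
  // first active lane's value of 'gid' to the rest of the wave.
  uint32_t first = __gpu_read_first_lane_u32(__gpu_lane_mask(), gid);
  (void)first;
}

Compiled for the spirv64 triple (as in the gpu_builtins.c and gpuintrin.c tests below), the __builtin_gpu_* calls behind these wrappers become the generic llvm.gpu.* intrinsics and are only mapped to a concrete architecture later, by the LowerGPUIntrinsic pass added in this patch.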
diff --git a/clang/test/CodeGen/gpu_builtins.c b/clang/test/CodeGen/gpu_builtins.c new file mode 100644 index 0000000000000..8231b0952fa5e --- /dev/null +++ b/clang/test/CodeGen/gpu_builtins.c @@ -0,0 +1,647 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes +// RUN: %clang_cc1 -O1 -triple spirv64 %s -emit-llvm -o - | FileCheck %s --check-prefix=SPIRV64 +// RUN: %clang_cc1 -O1 -triple spirv64-amd-amdhsa %s -emit-llvm -o - | FileCheck %s --check-prefix=AMDHSA +// RUN: %clang_cc1 -O1 -triple nvptx64 -emit-llvm %s -o - | FileCheck %s --check-prefix=NVPTX +// RUN: %clang_cc1 -O1 -triple amdgcn -emit-llvm %s -o - | FileCheck %s --check-prefix=AMDGCN + +#include <stdint.h> + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @num_blocks_x( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.x() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @num_blocks_x( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.blocks.x() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @num_blocks_x( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.x() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @num_blocks_x( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.x() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t num_blocks_x(void) { + return __builtin_gpu_num_blocks_x(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @num_blocks_y( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.y() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @num_blocks_y( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.blocks.y() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @num_blocks_y( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.y() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @num_blocks_y( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.y() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t num_blocks_y(void) { + return __builtin_gpu_num_blocks_y(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @num_blocks_z( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.z() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @num_blocks_z( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.blocks.z() +// AMDHSA-NEXT: 
ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @num_blocks_z( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.z() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @num_blocks_z( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.z() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t num_blocks_z(void) { + return __builtin_gpu_num_blocks_z(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @block_id_x( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.x() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @block_id_x( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.block.id.x() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @block_id_x( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.x() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @block_id_x( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.x() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t block_id_x(void) { + return __builtin_gpu_block_id_x(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @block_id_y( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.y() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @block_id_y( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.block.id.y() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @block_id_y( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.y() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @block_id_y( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.y() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t block_id_y(void) { + return __builtin_gpu_block_id_y(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @block_id_z( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.z() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @block_id_z( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.block.id.z() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @block_id_z( +// NVPTX-NEXT: 
entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.z() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @block_id_z( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.z() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t block_id_z(void) { + return __builtin_gpu_block_id_z(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @num_threads_x( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.x() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @num_threads_x( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.threads.x() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @num_threads_x( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.x() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @num_threads_x( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.x() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t num_threads_x(void) { + return __builtin_gpu_num_threads_x(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @num_threads_y( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.y() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @num_threads_y( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.threads.y() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @num_threads_y( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.y() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @num_threads_y( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.y() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t num_threads_y(void) { + return __builtin_gpu_num_threads_y(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @num_threads_z( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.z() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @num_threads_z( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.threads.z() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @num_threads_z( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.z() +// NVPTX-NEXT: ret i32 
[[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @num_threads_z( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.z() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t num_threads_z(void) { + return __builtin_gpu_num_threads_z(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @thread_id_x( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.x() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @thread_id_x( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.thread.id.x() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @thread_id_x( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.x() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @thread_id_x( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.x() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t thread_id_x(void) { + return __builtin_gpu_thread_id_x(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @thread_id_y( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.y() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @thread_id_y( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.thread.id.y() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @thread_id_y( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.y() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @thread_id_y( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.y() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t thread_id_y(void) { + return __builtin_gpu_thread_id_y(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @thread_id_z( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.z() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @thread_id_z( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.thread.id.z() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @thread_id_z( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.z() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: 
@thread_id_z( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.z() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t thread_id_z(void) { + return __builtin_gpu_thread_id_z(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @num_lanes( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.lanes() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @num_lanes( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.lanes() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @num_lanes( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.lanes() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @num_lanes( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.lanes() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t num_lanes(void) { + return __builtin_gpu_num_lanes(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @lane_id( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.lane.id() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @lane_id( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.lane.id() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @lane_id( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.lane.id() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @lane_id( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.lane.id() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t lane_id(void) { + return __builtin_gpu_lane_id(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @lane_mask( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.gpu.lane.mask() +// SPIRV64-NEXT: ret i64 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @lane_mask( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i64 @llvm.gpu.lane.mask() +// AMDHSA-NEXT: ret i64 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @lane_mask( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.gpu.lane.mask() +// NVPTX-NEXT: ret i64 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @lane_mask( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.gpu.lane.mask() +// AMDGCN-NEXT: ret i64 [[TMP0]] +// +uint64_t lane_mask(void) { + return __builtin_gpu_lane_mask(); +} + + 
+uint32_t +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @read_first_lane_u32( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.read.first.lane.u32(i64 [[__LANE_MASK:%.*]], i32 [[__X:%.*]]) +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @read_first_lane_u32( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.read.first.lane.u32(i64 [[__LANE_MASK:%.*]], i32 [[__X:%.*]]) +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @read_first_lane_u32( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.read.first.lane.u32(i64 [[__LANE_MASK:%.*]], i32 [[__X:%.*]]) +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @read_first_lane_u32( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.read.first.lane.u32(i64 [[__LANE_MASK:%.*]], i32 [[__X:%.*]]) +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) { + return __builtin_gpu_read_first_lane_u32(__lane_mask, __x); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @ballot( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.gpu.ballot(i64 [[__LANE_MASK:%.*]], i1 [[__X:%.*]]) +// SPIRV64-NEXT: ret i64 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @ballot( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i64 @llvm.gpu.ballot(i64 [[__LANE_MASK:%.*]], i1 [[__X:%.*]]) +// AMDHSA-NEXT: ret i64 [[TMP0]] +// +// NVPTX: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @ballot( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.gpu.ballot(i64 [[__LANE_MASK:%.*]], i1 [[__X:%.*]]) +// NVPTX-NEXT: ret i64 [[TMP0]] +// +// AMDGCN: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @ballot( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.gpu.ballot(i64 [[__LANE_MASK:%.*]], i1 [[__X:%.*]]) +// AMDGCN-NEXT: ret i64 [[TMP0]] +// +uint64_t ballot(uint64_t __lane_mask, + _Bool __x) { + return __builtin_gpu_ballot(__lane_mask, __x); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn +// SPIRV64-LABEL: @sync_threads( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: tail call void @llvm.gpu.sync.threads() +// SPIRV64-NEXT: ret void +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn +// AMDHSA-LABEL: @sync_threads( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: tail call addrspace(4) void @llvm.gpu.sync.threads() +// AMDHSA-NEXT: ret void +// +// NVPTX: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn +// NVPTX-LABEL: @sync_threads( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: tail call void @llvm.gpu.sync.threads() +// NVPTX-NEXT: ret void +// +// AMDGCN: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind 
willreturn +// AMDGCN-LABEL: @sync_threads( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: tail call void @llvm.gpu.sync.threads() +// AMDGCN-NEXT: ret void +// +void sync_threads(void) { + return __builtin_gpu_sync_threads(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn +// SPIRV64-LABEL: @sync_lane( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: tail call void @llvm.gpu.sync.lane(i64 [[__LANE_MASK:%.*]]) +// SPIRV64-NEXT: ret void +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn +// AMDHSA-LABEL: @sync_lane( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: tail call addrspace(4) void @llvm.gpu.sync.lane(i64 [[__LANE_MASK:%.*]]) +// AMDHSA-NEXT: ret void +// +// NVPTX: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn +// NVPTX-LABEL: @sync_lane( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: tail call void @llvm.gpu.sync.lane(i64 [[__LANE_MASK:%.*]]) +// NVPTX-NEXT: ret void +// +// AMDGCN: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn +// AMDGCN-LABEL: @sync_lane( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: tail call void @llvm.gpu.sync.lane(i64 [[__LANE_MASK:%.*]]) +// AMDGCN-NEXT: ret void +// +void sync_lane(uint64_t __lane_mask) { + return __builtin_gpu_sync_lane(__lane_mask); +} + + +uint32_t +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @shuffle_idx_u32( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.shuffle.idx.u32(i64 [[__LANE_MASK:%.*]], i32 [[__IDX:%.*]], i32 [[__X:%.*]], i32 [[__WIDTH:%.*]]) +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @shuffle_idx_u32( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.shuffle.idx.u32(i64 [[__LANE_MASK:%.*]], i32 [[__IDX:%.*]], i32 [[__X:%.*]], i32 [[__WIDTH:%.*]]) +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @shuffle_idx_u32( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.shuffle.idx.u32(i64 [[__LANE_MASK:%.*]], i32 [[__IDX:%.*]], i32 [[__X:%.*]], i32 [[__WIDTH:%.*]]) +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @shuffle_idx_u32( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.shuffle.idx.u32(i64 [[__LANE_MASK:%.*]], i32 [[__IDX:%.*]], i32 [[__X:%.*]], i32 [[__WIDTH:%.*]]) +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x, + uint32_t __width) { + return __builtin_gpu_shuffle_idx_u32(__lane_mask, __idx, __x, __width); +} + +// SPIRV64: Function Attrs: nofree norecurse noreturn nosync nounwind +// SPIRV64-LABEL: @gpu_exit( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: tail call void @llvm.gpu.exit() +// SPIRV64-NEXT: unreachable +// +// AMDHSA: Function Attrs: nofree norecurse noreturn nosync nounwind +// AMDHSA-LABEL: @gpu_exit( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: tail call addrspace(4) void @llvm.gpu.exit() +// AMDHSA-NEXT: unreachable +// +// NVPTX: Function Attrs: convergent nofree norecurse noreturn nosync nounwind +// NVPTX-LABEL: @gpu_exit( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: tail call void 
@llvm.gpu.exit() +// NVPTX-NEXT: unreachable +// +// AMDGCN: Function Attrs: convergent nofree norecurse noreturn nosync nounwind +// AMDGCN-LABEL: @gpu_exit( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: tail call void @llvm.gpu.exit() +// AMDGCN-NEXT: unreachable +// +void gpu_exit(void) { + return __builtin_gpu_exit(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn +// SPIRV64-LABEL: @thread_suspend( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: tail call void @llvm.gpu.thread.suspend() +// SPIRV64-NEXT: ret void +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn +// AMDHSA-LABEL: @thread_suspend( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: tail call addrspace(4) void @llvm.gpu.thread.suspend() +// AMDHSA-NEXT: ret void +// +// NVPTX: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn +// NVPTX-LABEL: @thread_suspend( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: tail call void @llvm.gpu.thread.suspend() +// NVPTX-NEXT: ret void +// +// AMDGCN: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn +// AMDGCN-LABEL: @thread_suspend( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: tail call void @llvm.gpu.thread.suspend() +// AMDGCN-NEXT: ret void +// +void thread_suspend(void) { + return __builtin_gpu_thread_suspend(); +} diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c index 9a15ce277ba87..eaf001be19ac9 100644 --- a/clang/test/Headers/gpuintrin.c +++ b/clang/test/Headers/gpuintrin.c @@ -9,6 +9,11 @@ // RUN: -target-feature +ptx62 \ // RUN: -triple nvptx64-nvidia-cuda -emit-llvm %s -o - \ // RUN: | FileCheck %s --check-prefix=NVPTX +// +// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \ +// RUN: -internal-isystem %S/../../lib/Headers/ \ +// RUN: -triple spirv64-- -emit-llvm %s -o - \ +// RUN: | FileCheck %s --check-prefix=SPIRV64 #include <gpuintrin.h> @@ -978,6 +983,224 @@ __gpu_kernel void foo() { // NVPTX-NEXT: call void @llvm.nvvm.exit() // NVPTX-NEXT: ret void // +// +// SPIRV64-LABEL: define spir_func void @foo( +// SPIRV64-SAME: ) #[[ATTR0:[0-9]+]] { +// SPIRV64-NEXT: [[ENTRY:.*:]] +// SPIRV64-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_num_blocks_x() +// SPIRV64-NEXT: [[CALL1:%.*]] = call spir_func i32 @__gpu_num_blocks_y() +// SPIRV64-NEXT: [[CALL2:%.*]] = call spir_func i32 @__gpu_num_blocks_z() +// SPIRV64-NEXT: [[CALL3:%.*]] = call spir_func i32 @__gpu_num_blocks(i32 noundef 0) +// SPIRV64-NEXT: [[CALL4:%.*]] = call spir_func i32 @__gpu_block_id_x() +// SPIRV64-NEXT: [[CALL5:%.*]] = call spir_func i32 @__gpu_block_id_y() +// SPIRV64-NEXT: [[CALL6:%.*]] = call spir_func i32 @__gpu_block_id_z() +// SPIRV64-NEXT: [[CALL7:%.*]] = call spir_func i32 @__gpu_block_id(i32 noundef 0) +// SPIRV64-NEXT: [[CALL8:%.*]] = call spir_func i32 @__gpu_num_threads_x() +// SPIRV64-NEXT: [[CALL9:%.*]] = call spir_func i32 @__gpu_num_threads_y() +// SPIRV64-NEXT: [[CALL10:%.*]] = call spir_func i32 @__gpu_num_threads_z() +// SPIRV64-NEXT: [[CALL11:%.*]] = call spir_func i32 @__gpu_num_threads(i32 noundef 0) +// SPIRV64-NEXT: [[CALL12:%.*]] = call spir_func i32 @__gpu_thread_id_x() +// SPIRV64-NEXT: [[CALL13:%.*]] = call spir_func i32 @__gpu_thread_id_y() +// SPIRV64-NEXT: [[CALL14:%.*]] = call spir_func i32 @__gpu_thread_id_z() +// SPIRV64-NEXT: [[CALL15:%.*]] = call spir_func i32 @__gpu_thread_id(i32 noundef 0) +// SPIRV64-NEXT: [[CALL16:%.*]] = call spir_func i32 @__gpu_num_lanes() +// SPIRV64-NEXT: [[CALL17:%.*]] = call spir_func i32 
@__gpu_lane_id() +// SPIRV64-NEXT: [[CALL18:%.*]] = call spir_func i64 @__gpu_lane_mask() +// SPIRV64-NEXT: [[CALL19:%.*]] = call spir_func i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) +// SPIRV64-NEXT: [[CALL20:%.*]] = call spir_func i64 @__gpu_read_first_lane_u64(i64 noundef -1, i64 noundef -1) +// SPIRV64-NEXT: [[CALL21:%.*]] = call spir_func i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) +// SPIRV64-NEXT: call spir_func void @__gpu_sync_threads() +// SPIRV64-NEXT: call spir_func void @__gpu_sync_lane(i64 noundef -1) +// SPIRV64-NEXT: [[CALL22:%.*]] = call spir_func i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) +// SPIRV64-NEXT: [[CALL23:%.*]] = call spir_func i64 @__gpu_first_lane_id(i64 noundef -1) +// SPIRV64-NEXT: [[CALL24:%.*]] = call spir_func zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) +// SPIRV64-NEXT: call spir_func void @__gpu_exit() #[[ATTR4:[0-9]+]] +// SPIRV64-NEXT: unreachable +// +// +// SPIRV64-LABEL: define internal spir_func i32 @__gpu_num_blocks( +// SPIRV64-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] { +// SPIRV64-NEXT: [[ENTRY:.*:]] +// SPIRV64-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// SPIRV64-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4 +// SPIRV64-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4 +// SPIRV64-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4 +// SPIRV64-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [ +// SPIRV64-NEXT: i32 0, label %[[SW_BB:.*]] +// SPIRV64-NEXT: i32 1, label %[[SW_BB1:.*]] +// SPIRV64-NEXT: i32 2, label %[[SW_BB3:.*]] +// SPIRV64-NEXT: ] +// SPIRV64: [[SW_BB]]: +// SPIRV64-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_num_blocks_x() +// SPIRV64-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: br label %[[RETURN:.*]] +// SPIRV64: [[SW_BB1]]: +// SPIRV64-NEXT: [[CALL2:%.*]] = call spir_func i32 @__gpu_num_blocks_y() +// SPIRV64-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: br label %[[RETURN]] +// SPIRV64: [[SW_BB3]]: +// SPIRV64-NEXT: [[CALL4:%.*]] = call spir_func i32 @__gpu_num_blocks_z() +// SPIRV64-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: br label %[[RETURN]] +// SPIRV64: [[SW_DEFAULT]]: +// SPIRV64-NEXT: unreachable +// SPIRV64: [[RETURN]]: +// SPIRV64-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: ret i32 [[TMP1]] +// +// +// SPIRV64-LABEL: define internal spir_func i32 @__gpu_block_id( +// SPIRV64-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] { +// SPIRV64-NEXT: [[ENTRY:.*:]] +// SPIRV64-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// SPIRV64-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4 +// SPIRV64-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4 +// SPIRV64-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4 +// SPIRV64-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [ +// SPIRV64-NEXT: i32 0, label %[[SW_BB:.*]] +// SPIRV64-NEXT: i32 1, label %[[SW_BB1:.*]] +// SPIRV64-NEXT: i32 2, label %[[SW_BB3:.*]] +// SPIRV64-NEXT: ] +// SPIRV64: [[SW_BB]]: +// SPIRV64-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_block_id_x() +// SPIRV64-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: br label %[[RETURN:.*]] +// SPIRV64: [[SW_BB1]]: +// SPIRV64-NEXT: [[CALL2:%.*]] = call spir_func i32 @__gpu_block_id_y() +// SPIRV64-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: br label %[[RETURN]] +// SPIRV64: [[SW_BB3]]: +// SPIRV64-NEXT: [[CALL4:%.*]] = call spir_func i32 
@__gpu_block_id_z() +// SPIRV64-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: br label %[[RETURN]] +// SPIRV64: [[SW_DEFAULT]]: +// SPIRV64-NEXT: unreachable +// SPIRV64: [[RETURN]]: +// SPIRV64-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: ret i32 [[TMP1]] +// +// +// SPIRV64-LABEL: define internal spir_func i32 @__gpu_num_threads( +// SPIRV64-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] { +// SPIRV64-NEXT: [[ENTRY:.*:]] +// SPIRV64-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// SPIRV64-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4 +// SPIRV64-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4 +// SPIRV64-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4 +// SPIRV64-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [ +// SPIRV64-NEXT: i32 0, label %[[SW_BB:.*]] +// SPIRV64-NEXT: i32 1, label %[[SW_BB1:.*]] +// SPIRV64-NEXT: i32 2, label %[[SW_BB3:.*]] +// SPIRV64-NEXT: ] +// SPIRV64: [[SW_BB]]: +// SPIRV64-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_num_threads_x() +// SPIRV64-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: br label %[[RETURN:.*]] +// SPIRV64: [[SW_BB1]]: +// SPIRV64-NEXT: [[CALL2:%.*]] = call spir_func i32 @__gpu_num_threads_y() +// SPIRV64-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: br label %[[RETURN]] +// SPIRV64: [[SW_BB3]]: +// SPIRV64-NEXT: [[CALL4:%.*]] = call spir_func i32 @__gpu_num_threads_z() +// SPIRV64-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: br label %[[RETURN]] +// SPIRV64: [[SW_DEFAULT]]: +// SPIRV64-NEXT: unreachable +// SPIRV64: [[RETURN]]: +// SPIRV64-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: ret i32 [[TMP1]] +// +// +// SPIRV64-LABEL: define internal spir_func i32 @__gpu_thread_id( +// SPIRV64-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] { +// SPIRV64-NEXT: [[ENTRY:.*:]] +// SPIRV64-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// SPIRV64-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4 +// SPIRV64-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4 +// SPIRV64-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4 +// SPIRV64-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [ +// SPIRV64-NEXT: i32 0, label %[[SW_BB:.*]] +// SPIRV64-NEXT: i32 1, label %[[SW_BB1:.*]] +// SPIRV64-NEXT: i32 2, label %[[SW_BB3:.*]] +// SPIRV64-NEXT: ] +// SPIRV64: [[SW_BB]]: +// SPIRV64-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_thread_id_x() +// SPIRV64-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: br label %[[RETURN:.*]] +// SPIRV64: [[SW_BB1]]: +// SPIRV64-NEXT: [[CALL2:%.*]] = call spir_func i32 @__gpu_thread_id_y() +// SPIRV64-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: br label %[[RETURN]] +// SPIRV64: [[SW_BB3]]: +// SPIRV64-NEXT: [[CALL4:%.*]] = call spir_func i32 @__gpu_thread_id_z() +// SPIRV64-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: br label %[[RETURN]] +// SPIRV64: [[SW_DEFAULT]]: +// SPIRV64-NEXT: unreachable +// SPIRV64: [[RETURN]]: +// SPIRV64-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: ret i32 [[TMP1]] +// +// +// SPIRV64-LABEL: define internal spir_func i64 @__gpu_read_first_lane_u64( +// SPIRV64-SAME: i64 noundef [[__LANE_MASK:%.*]], i64 noundef [[__X:%.*]]) #[[ATTR0]] { +// SPIRV64-NEXT: [[ENTRY:.*:]] +// SPIRV64-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8 +// SPIRV64-NEXT: [[__X_ADDR:%.*]] = alloca i64, align 8 +// SPIRV64-NEXT: [[__HI:%.*]] = alloca i32, align 4 
+// SPIRV64-NEXT: [[__LO:%.*]] = alloca i32, align 4 +// SPIRV64-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8 +// SPIRV64-NEXT: store i64 [[__X]], ptr [[__X_ADDR]], align 8 +// SPIRV64-NEXT: [[TMP0:%.*]] = load i64, ptr [[__X_ADDR]], align 8 +// SPIRV64-NEXT: [[SHR:%.*]] = lshr i64 [[TMP0]], 32 +// SPIRV64-NEXT: [[CONV:%.*]] = trunc i64 [[SHR]] to i32 +// SPIRV64-NEXT: store i32 [[CONV]], ptr [[__HI]], align 4 +// SPIRV64-NEXT: [[TMP1:%.*]] = load i64, ptr [[__X_ADDR]], align 8 +// SPIRV64-NEXT: [[AND:%.*]] = and i64 [[TMP1]], 4294967295 +// SPIRV64-NEXT: [[CONV1:%.*]] = trunc i64 [[AND]] to i32 +// SPIRV64-NEXT: store i32 [[CONV1]], ptr [[__LO]], align 4 +// SPIRV64-NEXT: [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8 +// SPIRV64-NEXT: [[TMP3:%.*]] = load i32, ptr [[__HI]], align 4 +// SPIRV64-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP2]], i32 noundef [[TMP3]]) +// SPIRV64-NEXT: [[CONV2:%.*]] = zext i32 [[CALL]] to i64 +// SPIRV64-NEXT: [[SHL:%.*]] = shl i64 [[CONV2]], 32 +// SPIRV64-NEXT: [[TMP4:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8 +// SPIRV64-NEXT: [[TMP5:%.*]] = load i32, ptr [[__LO]], align 4 +// SPIRV64-NEXT: [[CALL3:%.*]] = call spir_func i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP4]], i32 noundef [[TMP5]]) +// SPIRV64-NEXT: [[CONV4:%.*]] = zext i32 [[CALL3]] to i64 +// SPIRV64-NEXT: [[AND5:%.*]] = and i64 [[CONV4]], 4294967295 +// SPIRV64-NEXT: [[OR:%.*]] = or i64 [[SHL]], [[AND5]] +// SPIRV64-NEXT: ret i64 [[OR]] +// +// +// SPIRV64-LABEL: define internal spir_func i64 @__gpu_first_lane_id( +// SPIRV64-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] { +// SPIRV64-NEXT: [[ENTRY:.*:]] +// SPIRV64-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8 +// SPIRV64-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8 +// SPIRV64-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8 +// SPIRV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.cttz.i64(i64 [[TMP0]], i1 true) +// SPIRV64-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 1 +// SPIRV64-NEXT: [[ISZERO:%.*]] = icmp eq i64 [[TMP0]], 0 +// SPIRV64-NEXT: [[FFS:%.*]] = select i1 [[ISZERO]], i64 0, i64 [[TMP2]] +// SPIRV64-NEXT: [[CAST:%.*]] = trunc i64 [[FFS]] to i32 +// SPIRV64-NEXT: [[SUB:%.*]] = sub nsw i32 [[CAST]], 1 +// SPIRV64-NEXT: [[CONV:%.*]] = sext i32 [[SUB]] to i64 +// SPIRV64-NEXT: ret i64 [[CONV]] +// +// +// SPIRV64-LABEL: define internal spir_func zeroext i1 @__gpu_is_first_in_lane( +// SPIRV64-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] { +// SPIRV64-NEXT: [[ENTRY:.*:]] +// SPIRV64-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8 +// SPIRV64-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8 +// SPIRV64-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_lane_id() +// SPIRV64-NEXT: [[CONV:%.*]] = zext i32 [[CALL]] to i64 +// SPIRV64-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8 +// SPIRV64-NEXT: [[CALL1:%.*]] = call spir_func i64 @__gpu_first_lane_id(i64 noundef [[TMP0]]) +// SPIRV64-NEXT: [[CMP:%.*]] = icmp eq i64 [[CONV]], [[CALL1]] +// SPIRV64-NEXT: ret i1 [[CMP]] +// //. 
// AMDGPU: [[RNG3]] = !{i32 1, i32 0} // AMDGPU: [[META4]] = !{} diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 62239ca705b9e..c0613786a13bb 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2861,6 +2861,69 @@ def int_experimental_convergence_anchor def int_experimental_convergence_loop : DefaultAttrsIntrinsic<[llvm_token_ty], [], [IntrNoMem, IntrConvergent]>; +//===------- GPU Intrinsics -----------------------------------------------===// + +class GPUIntrinsic<LLVMType ret_type, string name> + : DefaultAttrsIntrinsic<[ret_type], + [], + [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>, + ClangBuiltin<name>; + +multiclass GPUGridIntrinsic_xyz<string prefix> { + def _x : GPUIntrinsic<llvm_i32_ty, !strconcat(prefix, "_x")>; + def _y : GPUIntrinsic<llvm_i32_ty, !strconcat(prefix, "_y")>; + def _z : GPUIntrinsic<llvm_i32_ty, !strconcat(prefix, "_z")>; +} + +defm int_gpu_num_blocks : GPUGridIntrinsic_xyz<"__builtin_gpu_num_blocks">; +defm int_gpu_block_id : GPUGridIntrinsic_xyz<"__builtin_gpu_block_id">; +defm int_gpu_num_threads : GPUGridIntrinsic_xyz<"__builtin_gpu_num_threads">; +defm int_gpu_thread_id : GPUGridIntrinsic_xyz<"__builtin_gpu_thread_id">; + +def int_gpu_num_lanes : GPUIntrinsic<llvm_i32_ty,"__builtin_gpu_num_lanes">; +def int_gpu_lane_id : GPUIntrinsic<llvm_i32_ty,"__builtin_gpu_lane_id">; +def int_gpu_lane_mask : GPUIntrinsic<llvm_i64_ty,"__builtin_gpu_lane_mask">; + +def int_gpu_read_first_lane_u32 : DefaultAttrsIntrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty], + [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>, + ClangBuiltin<"__builtin_gpu_read_first_lane_u32">; + +def int_gpu_shuffle_idx_u32 : DefaultAttrsIntrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrConvergent, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree]>, + ClangBuiltin<"__builtin_gpu_shuffle_idx_u32">; + +def int_gpu_ballot : DefaultAttrsIntrinsic<[llvm_i64_ty], + [llvm_i64_ty, llvm_i1_ty], + [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>, + ClangBuiltin<"__builtin_gpu_ballot">; + +def int_gpu_sync_threads : DefaultAttrsIntrinsic<[], [], + // todo, attributes match barrier, but there's a fence in here too + // also why is there a fence in here? 
+ [IntrNoMem, IntrHasSideEffects, IntrConvergent, + IntrWillReturn, IntrNoCallback, IntrNoFree]>, + ClangBuiltin<"__builtin_gpu_sync_threads">; + +def int_gpu_sync_lane : DefaultAttrsIntrinsic<[], + [llvm_i64_ty], + [IntrNoMem, IntrHasSideEffects, IntrConvergent, + IntrWillReturn, IntrNoCallback, IntrNoFree]>, + ClangBuiltin<"__builtin_gpu_sync_lane">; + +def int_gpu_exit : DefaultAttrsIntrinsic<[], + [], + [IntrNoReturn, IntrConvergent]>, + ClangBuiltin<"__builtin_gpu_exit">; + + +def int_gpu_thread_suspend : DefaultAttrsIntrinsic<[], + [], + [IntrWillReturn, IntrNoMem, IntrHasSideEffects, IntrConvergent]>, + ClangBuiltin<"__builtin_gpu_thread_suspend">; + + //===----------------------------------------------------------------------===// // Target-specific intrinsics //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 86e050333acc7..91095f2880d03 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -150,6 +150,8 @@ defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz; defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named <"__builtin_amdgcn_workgroup_id">; +defm int_amdgcn_grid_size : AMDGPUReadPreloadRegisterIntrinsic_xyz; + def int_amdgcn_dispatch_ptr : DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [], [Align<RetIndex, 4>, NoUndef<RetIndex>, NonNull<RetIndex>, IntrNoMem, IntrSpeculatable]>; diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 78ff93019fd7e..9cbe7b068c394 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -175,6 +175,7 @@ void initializeLoopUnrollPass(PassRegistry &); void initializeLowerAtomicLegacyPassPass(PassRegistry &); void initializeLowerEmuTLSPass(PassRegistry &); void initializeLowerGlobalDtorsLegacyPassPass(PassRegistry &); +void initializeLowerGPUIntrinsicPass(PassRegistry &); void initializeLowerIntrinsicsPass(PassRegistry &); void initializeLowerInvokeLegacyPassPass(PassRegistry &); void initializeLowerSwitchLegacyPassPass(PassRegistry &); diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h index fc772a7639c47..d746b7a39b871 100644 --- a/llvm/include/llvm/Transforms/Scalar.h +++ b/llvm/include/llvm/Transforms/Scalar.h @@ -135,6 +135,12 @@ FunctionPass *createSinkingPass(); // Pass *createLowerAtomicPass(); +//===----------------------------------------------------------------------===// +// +// LowerGPUIntrinsic - Lower GPU intrinsics +// +Pass *createLowerGPUIntrinsicPass(); + //===----------------------------------------------------------------------===// // // MergeICmps - Merge integer comparison chains into a memcmp diff --git a/llvm/include/llvm/Transforms/Scalar/LowerGPUIntrinsic.h b/llvm/include/llvm/Transforms/Scalar/LowerGPUIntrinsic.h new file mode 100644 index 0000000000000..6e793d4965287 --- /dev/null +++ b/llvm/include/llvm/Transforms/Scalar/LowerGPUIntrinsic.h @@ -0,0 +1,26 @@ +//===--- LowerGPUIntrinsic.h - Lower GPU intrinsics -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass lowers GPU intrinsics. 
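+// The llvm.gpu.* intrinsics are rewritten to target specific intrinsics or
+// short instruction sequences; see the lookup table in LowerGPUIntrinsic.cpp.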
+// +//===----------------------------------------------------------------------===// +#ifndef LLVM_TRANSFORMS_SCALAR_LOWERGPUINTRINSIC_H +#define LLVM_TRANSFORMS_SCALAR_LOWERGPUINTRINSIC_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +struct LowerGPUIntrinsicPass : public PassInfoMixin<LowerGPUIntrinsicPass> { + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + + static bool isRequired() { return true; } // otherwise O0 doesn't run it +}; +} // namespace llvm + +#endif // LLVM_TRANSFORMS_SCALAR_LOWERGPUINTRINSIC_H diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 99e78d3b6feb8..32234df9a364f 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -295,6 +295,7 @@ #include "llvm/Transforms/Scalar/LowerAtomicPass.h" #include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h" #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h" +#include "llvm/Transforms/Scalar/LowerGPUIntrinsic.h" #include "llvm/Transforms/Scalar/LowerGuardIntrinsic.h" #include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h" #include "llvm/Transforms/Scalar/LowerWidenableCondition.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 586d4b7e02fc1..f24b0a2e37329 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -96,6 +96,7 @@ MODULE_PASS("iroutliner", IROutlinerPass()) MODULE_PASS("jmc-instrumenter", JMCInstrumenterPass()) MODULE_PASS("lower-emutls", LowerEmuTLSPass()) MODULE_PASS("lower-global-dtors", LowerGlobalDtorsPass()) +MODULE_PASS("lower-gpu-intrinsic", LowerGPUIntrinsicPass()) MODULE_PASS("lower-ifunc", LowerIFuncPass()) MODULE_PASS("lowertypetests", LowerTypeTestsPass()) MODULE_PASS("fatlto-cleanup", FatLtoCleanup()) diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt index 84a5b02043d01..f35c81f2e661b 100644 --- a/llvm/lib/Transforms/Scalar/CMakeLists.txt +++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -51,6 +51,7 @@ add_llvm_component_library(LLVMScalarOpts LowerAtomicPass.cpp LowerConstantIntrinsics.cpp LowerExpectIntrinsic.cpp + LowerGPUIntrinsic.cpp LowerGuardIntrinsic.cpp LowerMatrixIntrinsics.cpp LowerWidenableCondition.cpp diff --git a/llvm/lib/Transforms/Scalar/LowerGPUIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerGPUIntrinsic.cpp new file mode 100644 index 0000000000000..6a0b70e52c309 --- /dev/null +++ b/llvm/lib/Transforms/Scalar/LowerGPUIntrinsic.cpp @@ -0,0 +1,501 @@ +//===- LowerGPUIntrinsic.cpp ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Lower the llvm.gpu intrinsics to target specific code sequences. +// Can be called from clang if building for a specific GPU or from the backend +// as part of a SPIRV lowering pipeline. Initial pass can lower to amdgcn or +// nvptx, adding further architectures means adding a column to the lookup table +// and further intrinsics adding a row. +// +// The idea is for the intrinsics to represent a thin abstraction over the +// different GPU architectures. 
In particular, code compiled to spirv-- without +// specifying a specific target can be specialised at JIT time, at which point +// this pass will rewrite those intrinsics to ones that the current backend +// knows. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/LowerGPUIntrinsic.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/ConstantRange.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/IntrinsicsNVPTX.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/TargetParser/Triple.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +#define DEBUG_TYPE "lower-gpu-intrinsic" + +using namespace llvm; + +namespace { + +// For each intrinsic, specify what function to call to lower it +typedef bool (*lowerFunction)(Module &M, IRBuilder<> &, Intrinsic::ID from, + CallBase *CI); + +// Simple lowering, directly replace the intrinsic with a different one +// with the same type, and optionally refine range metadata on the return value +template <Intrinsic::ID To> +bool S(Module &M, IRBuilder<> &, Intrinsic::ID from, CallBase *CI) { + + static_assert(To != Intrinsic::not_intrinsic); + Intrinsic::ID GenericID = from; + Intrinsic::ID SpecificID = To; + + bool Changed = false; + Function *Generic = Intrinsic::getDeclarationIfExists(&M, GenericID); + auto *Specific = Intrinsic::getOrInsertDeclaration(&M, SpecificID); + + if ((Generic->getType() != Specific->getType()) || + (Generic->getReturnType() != Specific->getReturnType())) + report_fatal_error("LowerGPUIntrinsic: Inconsistent types between " + "intrinsics in lookup table"); + + CI->setCalledFunction(Specific); + Changed = true; + + return Changed; +} + +// Replace intrinsic call with a linear sequence of instructions +typedef Value *(*builder)(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI); + +template <builder F> +bool B(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, CallBase *CI) { + bool Changed = false; + + Builder.SetInsertPoint(CI); + + Value *replacement = F(M, Builder, from, CI); + if (replacement) { + CI->replaceAllUsesWith(replacement); + CI->eraseFromParent(); + Changed = true; + } + + return Changed; +} + +template <Intrinsic::ID Numerator, Intrinsic::ID Denominator> +Value *intrinsicRatio(Module &M, IRBuilder<> &Builder, Intrinsic::ID, + CallBase *) { + Value *N = Builder.CreateIntrinsic(Numerator, {}, {}); + Value *D = Builder.CreateIntrinsic(Denominator, {}, {}); + return Builder.CreateUDiv(N, D); +} + +namespace amdgpu { +Value *lane_mask(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + auto &Ctx = M.getContext(); + return Builder.CreateIntrinsic( + Intrinsic::amdgcn_ballot, {Type::getInt64Ty(Ctx)}, + {ConstantInt::get(Type::getInt1Ty(Ctx), true)}); +} + +Value *lane_id(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + auto &Ctx = M.getContext(); + Constant *M1 = ConstantInt::get(Type::getInt32Ty(Ctx), -1); + Constant *Z = ConstantInt::get(Type::getInt32Ty(Ctx), 0); + + CallInst *Lo = + Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, {M1, Z}); + return Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {M1, Lo}); +} + +Value 
*first_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + auto &Ctx = M.getContext(); + return Builder.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, + {Type::getInt32Ty(Ctx)}, + {CI->getArgOperand(1)}); +} + +Value *shuffle_idx(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + auto &Ctx = M.getContext(); + + Value *idx = CI->getArgOperand(1); + Value *x = CI->getArgOperand(2); + Value *width = CI->getArgOperand(3); + + Value *id = Builder.CreateIntrinsic(Intrinsic::gpu_lane_id, {}, {}); + + Value *n = Builder.CreateSub(ConstantInt::get(Type::getInt32Ty(Ctx), 0), + width, "not"); + Value *a = Builder.CreateAnd(id, n, "and"); + Value *add = Builder.CreateAdd(a, idx, "add"); + Value *shl = + Builder.CreateShl(add, ConstantInt::get(Type::getInt32Ty(Ctx), 2), "shl"); + return Builder.CreateIntrinsic(Intrinsic::amdgcn_ds_bpermute, {}, {shl, x}); +} + +Value *ballot(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + auto &Ctx = M.getContext(); + + Value *C = + Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot, {Type::getInt64Ty(Ctx)}, + {CI->getArgOperand(1)}); + + return Builder.CreateAnd(C, CI->getArgOperand(0)); +} + +Value *sync_threads(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + auto &Ctx = M.getContext(); + Builder.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {}); + + Value *F = Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, + Ctx.getOrInsertSyncScopeID("workgroup")); + + return F; +} + +Value *sync_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + return Builder.CreateIntrinsic(Intrinsic::amdgcn_wave_barrier, {}, {}); +} + +Value *thread_suspend(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + + auto &Ctx = M.getContext(); + return Builder.CreateIntrinsic(Intrinsic::amdgcn_s_sleep, {}, + {ConstantInt::get(Type::getInt32Ty(Ctx), 2)}); +} + +Value *dispatch_ptr(IRBuilder<> &Builder) { + CallInst *Call = + Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {}, {}); + Call->addRetAttr( + Attribute::getWithDereferenceableBytes(Call->getContext(), 64)); + Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4))); + return Call; +} + +Value *implicit_arg_ptr(IRBuilder<> &Builder) { + CallInst *Call = + Builder.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {}, {}); + Call->addRetAttr( + Attribute::getWithDereferenceableBytes(Call->getContext(), 256)); + Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8))); + return Call; +} + +template <unsigned Index> +Value *grid_size(Module &M, IRBuilder<> &Builder, Intrinsic::ID, CallBase *) { + auto &Ctx = M.getContext(); + const unsigned XOffset = 12; + auto *DP = dispatch_ptr(Builder); + + // Indexing the HSA kernel_dispatch_packet struct. + auto *Offset = ConstantInt::get(Type::getInt32Ty(Ctx), XOffset + Index * 4); + auto *GEP = Builder.CreateGEP(Type::getInt8Ty(Ctx), DP, Offset); + auto *LD = Builder.CreateLoad(Type::getInt32Ty(Ctx), GEP); + llvm::MDBuilder MDB(Ctx); + // Known non-zero. + LD->setMetadata(llvm::LLVMContext::MD_range, + MDB.createRange(APInt(32, 1), APInt::getZero(32))); + LD->setMetadata(llvm::LLVMContext::MD_invariant_load, + llvm::MDNode::get(Ctx, {})); + return LD; +} + +template <int Index> +Value *WGSize(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + + // Note: "__oclc_ABI_version" is supposed to be emitted and initialized by + // clang during compilation of user code. 
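+  // When it is present, the comparison against COV_5 below selects between the
+  // implicit argument block and the dispatch packet as the source of the size.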
+ StringRef Name = "__oclc_ABI_version"; + auto *ABIVersionC = M.getNamedGlobal(Name); + if (!ABIVersionC) { + // In CGBuiltin, we'd have to create an extern variable to emit the load for + // Here, we can leave the intrinsic in place and it'll get lowered later + return nullptr; + } + auto &Ctx = M.getContext(); + + Value *ABIVersion = Builder.CreateLoad(Type::getInt32Ty(Ctx), ABIVersionC); + + Value *IsCOV5 = Builder.CreateICmpSGE( + ABIVersion, + ConstantInt::get(Type::getInt32Ty(Ctx), CodeObjectVersionKind::COV_5)); + + Value *ImplicitGEP = Builder.CreateConstGEP1_32( + Type::getInt8Ty(Ctx), implicit_arg_ptr(Builder), 12 + Index * 2); + + // Indexing the HSA kernel_dispatch_packet struct. + Value *DispatchGEP = Builder.CreateConstGEP1_32( + Type::getInt8Ty(Ctx), dispatch_ptr(Builder), 4 + Index * 2); + + auto Result = Builder.CreateSelect(IsCOV5, ImplicitGEP, DispatchGEP); + LoadInst *LD = Builder.CreateLoad(Type::getInt16Ty(Ctx), Result); + + // TODO: CGBuiltin digs MaxOpenCLWorkGroupSize out of targetinfo and limtis + // the range on the load based on that (MD_range) + + LD->setMetadata(llvm::LLVMContext::MD_noundef, llvm::MDNode::get(Ctx, {})); + LD->setMetadata(llvm::LLVMContext::MD_invariant_load, + llvm::MDNode::get(Ctx, {})); + + // The workgroup size is a uint16_t but gpu_block_id returns a uint32_t + return Builder.CreateZExt(LD, Type::getInt32Ty(Ctx)); +} + +} // namespace amdgpu + +namespace nvptx { +Value *lane_mask(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + auto &Ctx = M.getContext(); + CallInst *C = Builder.CreateIntrinsic(Intrinsic::nvvm_activemask, {}, {}); + return Builder.CreateZExt(C, Type::getInt64Ty(Ctx), "conv"); +} + +Value *first_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + auto &Ctx = M.getContext(); + Value *conv = + Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(Ctx), "conv"); + Value *C = Builder.CreateIntrinsic( + Intrinsic::cttz, {Type::getInt32Ty(Ctx)}, + {conv, ConstantInt::get(Type::getInt1Ty(Ctx), true)}); + Value *iszero = Builder.CreateICmpEQ( + conv, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "iszero"); + Value *sub = Builder.CreateSelect( + iszero, ConstantInt::get(Type::getInt32Ty(Ctx), -1), C, "sub"); + + return Builder.CreateIntrinsic(Intrinsic::nvvm_shfl_sync_idx_i32, {}, + {conv, CI->getArgOperand(1), sub, + ConstantInt::get(Type::getInt32Ty(Ctx), 31)}); +} + +Value *shuffle_idx(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + auto &Ctx = M.getContext(); + + Value *lane_mask = CI->getArgOperand(0); + Value *idx = CI->getArgOperand(1); + Value *x = CI->getArgOperand(2); + Value *width = CI->getArgOperand(3); + + Value *Conv = Builder.CreateTrunc(lane_mask, Type::getInt32Ty(Ctx), "conv"); + + Value *sh_prom = Builder.CreateZExt(idx, Type::getInt64Ty(Ctx), "sh_prom"); + Value *shl0 = + Builder.CreateShl(width, ConstantInt::get(Type::getInt32Ty(Ctx), 8)); + Value *or0 = Builder.CreateSub(ConstantInt::get(Type::getInt32Ty(Ctx), 8223), + shl0, "or"); + + Value *core = Builder.CreateIntrinsic(Intrinsic::nvvm_shfl_sync_idx_i32, {}, + {Conv, x, idx, or0}); + + Value *shl1 = + Builder.CreateShl(ConstantInt::get(Type::getInt64Ty(Ctx), 1), sh_prom); + Value *and0 = Builder.CreateAnd(shl1, lane_mask); + Value *cmp = + Builder.CreateICmpEQ(and0, ConstantInt::get(Type::getInt64Ty(Ctx), 0)); + Value *and4 = Builder.CreateSelect( + cmp, ConstantInt::get(Type::getInt32Ty(Ctx), 0), core, "and4"); + + return and4; +} + +Value *ballot(Module &M, IRBuilder<> 
&Builder, Intrinsic::ID from, + CallBase *CI) { + auto &Ctx = M.getContext(); + Value *Conv = + Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(Ctx), "conv"); + Value *C = Builder.CreateIntrinsic(Intrinsic::nvvm_vote_ballot_sync, {}, + {Conv, CI->getArgOperand(1)}); + + return Builder.CreateZExt(C, Type::getInt64Ty(Ctx), "conv"); +} + +Value *sync_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + + auto &Ctx = M.getContext(); + Value *X = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(Ctx)); + return Builder.CreateIntrinsic(Intrinsic::nvvm_bar_warp_sync, {}, {X}); +} + +Value *thread_suspend(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + + auto &Ctx = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + + Value *str = Builder.CreateGlobalString( + "__CUDA_ARCH", "", DL.getDefaultGlobalsAddressSpace(), &M); + + Builder.SetInsertPoint(CI); + Value *Reflect = Builder.CreateIntrinsic(Intrinsic::nvvm_reflect, {}, {str}); + Value *Cmp = Builder.CreateICmpUGT( + Reflect, ConstantInt::get(Type::getInt32Ty(Ctx), 699)); + + Builder.SetInsertPoint(SplitBlockAndInsertIfThen(Cmp, CI, false)); + + Builder.CreateIntrinsic(Intrinsic::nvvm_nanosleep, {}, + {ConstantInt::get(Type::getInt32Ty(Ctx), 64)}); + + CI->eraseFromParent(); + return nullptr; // All done +} + +} // namespace nvptx + +struct IntrinsicMap { + Intrinsic::ID Generic; + lowerFunction AMDGPU; + lowerFunction NVPTX; +}; + +using namespace Intrinsic; + +static const IntrinsicMap ls[] = { + // This table of intrinsic => what to do with it is walked in order. + // A row can create calls to intrinsics that are expanded in subsequent rows + // but that does mean that the order of rows is somewhat significant. + // S<intrinsic> is a simple lowering to an existing intrinsic + // B<function> involves building a short sequence of instructions + + // amdgpu defines num_blocks as an integer ratio of two other intrinsics. + // amdgcn_grid_size and gpu_num_threads are expanded further down. + { + gpu_num_blocks_x, + B<intrinsicRatio<amdgcn_grid_size_x, gpu_num_threads_x>>, + S<nvvm_read_ptx_sreg_nctaid_x>, + }, + { + gpu_num_blocks_y, + B<intrinsicRatio<amdgcn_grid_size_y, gpu_num_threads_y>>, + S<nvvm_read_ptx_sreg_nctaid_y>, + }, + { + gpu_num_blocks_z, + B<intrinsicRatio<amdgcn_grid_size_z, gpu_num_threads_z>>, + S<nvvm_read_ptx_sreg_nctaid_z>, + }, + + // Note: Could canonicalise in favour of the target agnostic one without + // breaking existing users of builtin or intrinsic: + // {amdgcn_workgroup_id_x, S<gpu_block_id_x>, nullptr}, + // {gpu_block_id_x, nullptr, S<nvvm_read_ptx_sreg_ctaid_x>}, + // Using the target agnostic one throughout the rest of the backend would + // work fine, and amdgpu-no-workgroup-id-x attribute and similar may be + // applicable to other targets. + // Map {block,thread}_id onto existing intrinsics for the time being. 
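+    // A nullptr entry means calls to that intrinsic are left unchanged for the
+    // corresponding target (see runOnModule below).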
+ {gpu_block_id_x, S<amdgcn_workgroup_id_x>, S<nvvm_read_ptx_sreg_ctaid_x>}, + {gpu_block_id_y, S<amdgcn_workgroup_id_y>, S<nvvm_read_ptx_sreg_ctaid_y>}, + {gpu_block_id_z, S<amdgcn_workgroup_id_z>, S<nvvm_read_ptx_sreg_ctaid_z>}, + {gpu_thread_id_x, S<amdgcn_workitem_id_x>, S<nvvm_read_ptx_sreg_tid_x>}, + {gpu_thread_id_y, S<amdgcn_workitem_id_y>, S<nvvm_read_ptx_sreg_tid_y>}, + {gpu_thread_id_z, S<amdgcn_workitem_id_z>, S<nvvm_read_ptx_sreg_tid_z>}, + + // CGBuiltin maps builtin_amdgcn_workgroup_size onto gpu_num_threads + {gpu_num_threads_x, B<amdgpu::WGSize<0>>, S<nvvm_read_ptx_sreg_ntid_x>}, + {gpu_num_threads_y, B<amdgpu::WGSize<1>>, S<nvvm_read_ptx_sreg_ntid_y>}, + {gpu_num_threads_z, B<amdgpu::WGSize<2>>, S<nvvm_read_ptx_sreg_ntid_z>}, + + // Some of the following intrinsics need minor impedance matching + {gpu_num_lanes, S<amdgcn_wavefrontsize>, S<nvvm_read_ptx_sreg_warpsize>}, + {gpu_lane_mask, B<amdgpu::lane_mask>, B<nvptx::lane_mask>}, + + {gpu_read_first_lane_u32, B<amdgpu::first_lane>, B<nvptx::first_lane>}, + {gpu_shuffle_idx_u32, B<amdgpu::shuffle_idx>, B<nvptx::shuffle_idx>}, + + // shuffle sometimes emits call into lane_id so lower lane_id after shuffle + {gpu_lane_id, B<amdgpu::lane_id>, S<nvvm_read_ptx_sreg_laneid>}, + + {gpu_ballot, B<amdgpu::ballot>, B<nvptx::ballot>}, + + {gpu_sync_threads, B<amdgpu::sync_threads>, S<nvvm_barrier0>}, + {gpu_sync_lane, B<amdgpu::sync_lane>, B<nvptx::sync_lane>}, + + {gpu_thread_suspend, B<amdgpu::thread_suspend>, B<nvptx::thread_suspend>}, + {gpu_exit, S<amdgcn_endpgm>, S<nvvm_exit>}, + + // These aren't generic intrinsics but lowering them here instead of + // in CGBuiltin allows the above to be implemented partly in terms of + // amdgcn_grid_size. + {amdgcn_grid_size_x, B<amdgpu::grid_size<0>>, nullptr}, + {amdgcn_grid_size_y, B<amdgpu::grid_size<1>>, nullptr}, + {amdgcn_grid_size_z, B<amdgpu::grid_size<2>>, nullptr}, +}; + +class LowerGPUIntrinsic : public ModulePass { +public: + static char ID; + + LowerGPUIntrinsic() : ModulePass(ID) {} + + bool runOnModule(Module &M) override; +}; + +bool LowerGPUIntrinsic::runOnModule(Module &M) { + bool Changed = false; + + Triple TT(M.getTargetTriple()); + + if (!TT.isAMDGPU() && !TT.isNVPTX()) { + return Changed; + } + + auto &Ctx = M.getContext(); + IRBuilder<> Builder(Ctx); + + for (const IntrinsicMap &I : ls) { + auto *Intr = Intrinsic::getDeclarationIfExists(&M, I.Generic); + if (!Intr) + continue; + + lowerFunction maybeLowering = TT.isAMDGPU() ? I.AMDGPU : I.NVPTX; + if (maybeLowering == nullptr) + continue; + + for (auto *U : make_early_inc_range(Intr->users())) { + if (auto *CI = dyn_cast<CallBase>(U)) { + if (CI->getCalledFunction() == Intr) + Changed |= maybeLowering(M, Builder, I.Generic, CI); + } + } + } + + return Changed; +} + +} // namespace + +char LowerGPUIntrinsic::ID = 0; + +INITIALIZE_PASS(LowerGPUIntrinsic, DEBUG_TYPE, "Lower GPU Intrinsic", false, + false) + +Pass *llvm::createLowerGPUIntrinsicPass() { return new LowerGPUIntrinsic(); } + +PreservedAnalyses LowerGPUIntrinsicPass::run(Module &M, + ModuleAnalysisManager &) { + return LowerGPUIntrinsic().runOnModule(M) ? 
PreservedAnalyses::none() + : PreservedAnalyses::all(); +} diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp index c7e4a3e824700..66ef7ebfa5fe5 100644 --- a/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -34,6 +34,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLoopTermFoldPass(Registry); initializeLoopUnrollPass(Registry); initializeLowerAtomicLegacyPassPass(Registry); + initializeLowerGPUIntrinsicPass(Registry); initializeMergeICmpsLegacyPassPass(Registry); initializeNaryReassociateLegacyPassPass(Registry); initializePartiallyInlineLibCallsLegacyPassPass(Registry); diff --git a/llvm/test/CodeGen/SPIRV/gpu_intrinsics.ll b/llvm/test/CodeGen/SPIRV/gpu_intrinsics.ll new file mode 100644 index 0000000000000..41c14bc323e61 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/gpu_intrinsics.ll @@ -0,0 +1,427 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes +; RUN: opt -S -mtriple=amdgcn-- -passes=lower-gpu-intrinsic < %s | FileCheck %s --check-prefix=AMDGCN +; RUN: opt -S -mtriple=nvptx64-- -passes=lower-gpu-intrinsic < %s | FileCheck %s --check-prefix=NVPTX + +; Used by amdgpu to lower llvm.gpu.num.threads, harmless on nvptx +@__oclc_ABI_version = weak_odr hidden addrspace(4) constant i32 500 + +define i32 @num_blocks_x() { +; AMDGCN-LABEL: @num_blocks_x( +; AMDGCN-NEXT: [[TMP1:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP1]], i32 12 +; AMDGCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !range [[RNG0:![0-9]+]], !invariant.load [[META1:![0-9]+]] +; AMDGCN-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4 +; AMDGCN-NEXT: [[TMP13:%.*]] = icmp sge i32 [[TMP12]], 500 +; AMDGCN-NEXT: [[TMP6:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; AMDGCN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 12 +; AMDGCN-NEXT: [[TMP8:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP8]], i32 4 +; AMDGCN-NEXT: [[TMP10:%.*]] = select i1 [[TMP13]], ptr addrspace(4) [[TMP7]], ptr addrspace(4) [[TMP9]] +; AMDGCN-NEXT: [[TMP11:%.*]] = load i16, ptr addrspace(4) [[TMP10]], align 2, !invariant.load [[META1]], !noundef [[META1]] +; AMDGCN-NEXT: [[TMP4:%.*]] = zext i16 [[TMP11]] to i32 +; AMDGCN-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP3]], [[TMP4]] +; AMDGCN-NEXT: ret i32 [[TMP5]] +; +; NVPTX-LABEL: @num_blocks_x( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.num.blocks.x() + ret i32 %1 +} + +declare i32 @llvm.gpu.num.blocks.x() + +define i32 @num_blocks_y() { +; AMDGCN-LABEL: @num_blocks_y( +; AMDGCN-NEXT: [[TMP1:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP1]], i32 16 +; AMDGCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !range [[RNG0]], !invariant.load [[META1]] +; AMDGCN-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4 +; AMDGCN-NEXT: [[TMP13:%.*]] = icmp sge i32 [[TMP12]], 500 +; AMDGCN-NEXT: [[TMP6:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; 
AMDGCN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 14 +; AMDGCN-NEXT: [[TMP8:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP8]], i32 6 +; AMDGCN-NEXT: [[TMP10:%.*]] = select i1 [[TMP13]], ptr addrspace(4) [[TMP7]], ptr addrspace(4) [[TMP9]] +; AMDGCN-NEXT: [[TMP11:%.*]] = load i16, ptr addrspace(4) [[TMP10]], align 2, !invariant.load [[META1]], !noundef [[META1]] +; AMDGCN-NEXT: [[TMP4:%.*]] = zext i16 [[TMP11]] to i32 +; AMDGCN-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP3]], [[TMP4]] +; AMDGCN-NEXT: ret i32 [[TMP5]] +; +; NVPTX-LABEL: @num_blocks_y( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.num.blocks.y() + ret i32 %1 +} + +declare i32 @llvm.gpu.num.blocks.y() + +define i32 @num_blocks_z() { +; AMDGCN-LABEL: @num_blocks_z( +; AMDGCN-NEXT: [[TMP1:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP1]], i32 20 +; AMDGCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !range [[RNG0]], !invariant.load [[META1]] +; AMDGCN-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4 +; AMDGCN-NEXT: [[TMP13:%.*]] = icmp sge i32 [[TMP12]], 500 +; AMDGCN-NEXT: [[TMP6:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; AMDGCN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 16 +; AMDGCN-NEXT: [[TMP8:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP8]], i32 8 +; AMDGCN-NEXT: [[TMP10:%.*]] = select i1 [[TMP13]], ptr addrspace(4) [[TMP7]], ptr addrspace(4) [[TMP9]] +; AMDGCN-NEXT: [[TMP11:%.*]] = load i16, ptr addrspace(4) [[TMP10]], align 2, !invariant.load [[META1]], !noundef [[META1]] +; AMDGCN-NEXT: [[TMP4:%.*]] = zext i16 [[TMP11]] to i32 +; AMDGCN-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP3]], [[TMP4]] +; AMDGCN-NEXT: ret i32 [[TMP5]] +; +; NVPTX-LABEL: @num_blocks_z( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.num.blocks.z() + ret i32 %1 +} + +declare i32 @llvm.gpu.num.blocks.z() + +define i32 @block_id_x() { +; AMDGCN-LABEL: @block_id_x( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @block_id_x( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.block.id.x() + ret i32 %1 +} + +declare i32 @llvm.gpu.block.id.x() + +define i32 @block_id_y() { +; AMDGCN-LABEL: @block_id_y( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @block_id_y( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.block.id.y() + ret i32 %1 +} + +declare i32 @llvm.gpu.block.id.y() + +define i32 @block_id_z() { +; AMDGCN-LABEL: @block_id_z( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @block_id_z( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 
@llvm.gpu.block.id.z() + ret i32 %1 +} + +declare i32 @llvm.gpu.block.id.z() + +define i32 @num_threads_x() { +; AMDGCN-LABEL: @num_threads_x( +; AMDGCN-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4 +; AMDGCN-NEXT: [[TMP2:%.*]] = icmp sge i32 [[TMP9]], 500 +; AMDGCN-NEXT: [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], i32 12 +; AMDGCN-NEXT: [[TMP5:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP5]], i32 4 +; AMDGCN-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], ptr addrspace(4) [[TMP4]], ptr addrspace(4) [[TMP6]] +; AMDGCN-NEXT: [[TMP8:%.*]] = load i16, ptr addrspace(4) [[TMP7]], align 2, !invariant.load [[META1]], !noundef [[META1]] +; AMDGCN-NEXT: [[TMP1:%.*]] = zext i16 [[TMP8]] to i32 +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @num_threads_x( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.num.threads.x() + ret i32 %1 +} + +declare i32 @llvm.gpu.num.threads.x() + +define i32 @num_threads_y() { +; AMDGCN-LABEL: @num_threads_y( +; AMDGCN-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4 +; AMDGCN-NEXT: [[TMP2:%.*]] = icmp sge i32 [[TMP9]], 500 +; AMDGCN-NEXT: [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], i32 14 +; AMDGCN-NEXT: [[TMP5:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP5]], i32 6 +; AMDGCN-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], ptr addrspace(4) [[TMP4]], ptr addrspace(4) [[TMP6]] +; AMDGCN-NEXT: [[TMP8:%.*]] = load i16, ptr addrspace(4) [[TMP7]], align 2, !invariant.load [[META1]], !noundef [[META1]] +; AMDGCN-NEXT: [[TMP1:%.*]] = zext i16 [[TMP8]] to i32 +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @num_threads_y( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.num.threads.y() + ret i32 %1 +} + +declare i32 @llvm.gpu.num.threads.y() + +define i32 @num_threads_z() { +; AMDGCN-LABEL: @num_threads_z( +; AMDGCN-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4 +; AMDGCN-NEXT: [[TMP2:%.*]] = icmp sge i32 [[TMP9]], 500 +; AMDGCN-NEXT: [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], i32 16 +; AMDGCN-NEXT: [[TMP5:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP5]], i32 8 +; AMDGCN-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], ptr addrspace(4) [[TMP4]], ptr addrspace(4) [[TMP6]] +; AMDGCN-NEXT: [[TMP8:%.*]] = load i16, ptr addrspace(4) [[TMP7]], align 2, !invariant.load [[META1]], !noundef [[META1]] +; AMDGCN-NEXT: [[TMP1:%.*]] = zext i16 [[TMP8]] to i32 +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @num_threads_z( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.num.threads.z() + ret i32 %1 +} + +declare i32 @llvm.gpu.num.threads.z() + +define 
i32 @thread_id_x() { +; AMDGCN-LABEL: @thread_id_x( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @thread_id_x( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.thread.id.x() + ret i32 %1 +} + +declare i32 @llvm.gpu.thread.id.x() + +define i32 @thread_id_y() { +; AMDGCN-LABEL: @thread_id_y( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @thread_id_y( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.thread.id.y() + ret i32 %1 +} + +declare i32 @llvm.gpu.thread.id.y() + +define i32 @thread_id_z() { +; AMDGCN-LABEL: @thread_id_z( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @thread_id_z( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.thread.id.z() + ret i32 %1 +} + +declare i32 @llvm.gpu.thread.id.z() + +define i32 @num_lanes() { +; AMDGCN-LABEL: @num_lanes( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.wavefrontsize() +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @num_lanes( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.num.lanes() + ret i32 %1 +} + +declare i32 @llvm.gpu.num.lanes() + +define i32 @lane_id() { +; AMDGCN-LABEL: @lane_id( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; AMDGCN-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP1]]) +; AMDGCN-NEXT: ret i32 [[TMP2]] +; +; NVPTX-LABEL: @lane_id( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.laneid() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.lane.id() + ret i32 %1 +} + +declare i32 @llvm.gpu.lane.id() + +define i64 @lane_mask() { +; AMDGCN-LABEL: @lane_mask( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; AMDGCN-NEXT: ret i64 [[TMP1]] +; +; NVPTX-LABEL: @lane_mask( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.activemask() +; NVPTX-NEXT: [[CONV:%.*]] = zext i32 [[TMP1]] to i64 +; NVPTX-NEXT: ret i64 [[CONV]] +; + %1 = call i64 @llvm.gpu.lane.mask() + ret i64 %1 +} + +declare i64 @llvm.gpu.lane.mask() + +define i32 @read_first_lane_u32(i64 %lane_mask, i32 %x) { +; AMDGCN-LABEL: @read_first_lane_u32( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[X:%.*]]) +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @read_first_lane_u32( +; NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[LANE_MASK:%.*]] to i32 +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.cttz.i32(i32 [[CONV]], i1 true) +; NVPTX-NEXT: [[ISZERO:%.*]] = icmp eq i32 [[CONV]], 0 +; NVPTX-NEXT: [[SUB:%.*]] = select i1 [[ISZERO]], i32 -1, i32 [[TMP1]] +; NVPTX-NEXT: [[TMP2:%.*]] = call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 [[CONV]], i32 [[X:%.*]], i32 [[SUB]], i32 31) +; NVPTX-NEXT: ret i32 [[TMP2]] +; + %1 = call i32 @llvm.gpu.read.first.lane.u32(i64 %lane_mask, i32 %x) + ret i32 %1 +} + +declare i32 @llvm.gpu.read.first.lane.u32(i64, i32) + +define i64 @ballot(i64 %lane_mask, i1 zeroext %x) { +; AMDGCN-LABEL: @ballot( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[X:%.*]]) +; AMDGCN-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], [[LANE_MASK:%.*]] +; 
AMDGCN-NEXT: ret i64 [[TMP2]] +; +; NVPTX-LABEL: @ballot( +; NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[LANE_MASK:%.*]] to i32 +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.vote.ballot.sync(i32 [[CONV]], i1 [[X:%.*]]) +; NVPTX-NEXT: [[CONV1:%.*]] = zext i32 [[TMP1]] to i64 +; NVPTX-NEXT: ret i64 [[CONV1]] +; + %1 = call i64 @llvm.gpu.ballot(i64 %lane_mask, i1 %x) + ret i64 %1 +} + +declare i64 @llvm.gpu.ballot(i64, i1) + +define void @sync_threads() { +; AMDGCN-LABEL: @sync_threads( +; AMDGCN-NEXT: call void @llvm.amdgcn.s.barrier() +; AMDGCN-NEXT: fence syncscope("workgroup") seq_cst +; AMDGCN-NEXT: ret void +; +; NVPTX-LABEL: @sync_threads( +; NVPTX-NEXT: call void @llvm.nvvm.barrier0() +; NVPTX-NEXT: ret void +; + call void @llvm.gpu.sync.threads() + ret void +} + +declare void @llvm.gpu.sync.threads() + +define void @sync_lane(i64 %lane_mask) { +; AMDGCN-LABEL: @sync_lane( +; AMDGCN-NEXT: call void @llvm.amdgcn.wave.barrier() +; AMDGCN-NEXT: ret void +; +; NVPTX-LABEL: @sync_lane( +; NVPTX-NEXT: [[TMP1:%.*]] = trunc i64 [[LANE_MASK:%.*]] to i32 +; NVPTX-NEXT: call void @llvm.nvvm.bar.warp.sync(i32 [[TMP1]]) +; NVPTX-NEXT: ret void +; + call void @llvm.gpu.sync.lane(i64 %lane_mask) + ret void +} + +declare void @llvm.gpu.sync.lane(i64) + +define i32 @shuffle_idx_u32(i64 %lane_mask, i32 %idx, i32 %x, i32 %width) { +; AMDGCN-LABEL: @shuffle_idx_u32( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; AMDGCN-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP1]]) +; AMDGCN-NEXT: [[NOT:%.*]] = sub i32 0, [[WIDTH:%.*]] +; AMDGCN-NEXT: [[AND:%.*]] = and i32 [[TMP2]], [[NOT]] +; AMDGCN-NEXT: [[ADD:%.*]] = add i32 [[AND]], [[IDX:%.*]] +; AMDGCN-NEXT: [[SHL:%.*]] = shl i32 [[ADD]], 2 +; AMDGCN-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[SHL]], i32 [[X:%.*]]) +; AMDGCN-NEXT: ret i32 [[TMP3]] +; +; NVPTX-LABEL: @shuffle_idx_u32( +; NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[LANE_MASK:%.*]] to i32 +; NVPTX-NEXT: [[SH_PROM:%.*]] = zext i32 [[IDX:%.*]] to i64 +; NVPTX-NEXT: [[TMP1:%.*]] = shl i32 [[WIDTH:%.*]], 8 +; NVPTX-NEXT: [[OR:%.*]] = sub i32 8223, [[TMP1]] +; NVPTX-NEXT: [[TMP2:%.*]] = call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 [[CONV]], i32 [[X:%.*]], i32 [[IDX]], i32 [[OR]]) +; NVPTX-NEXT: [[TMP3:%.*]] = shl i64 1, [[SH_PROM]] +; NVPTX-NEXT: [[TMP4:%.*]] = and i64 [[TMP3]], [[LANE_MASK]] +; NVPTX-NEXT: [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 0 +; NVPTX-NEXT: [[AND4:%.*]] = select i1 [[TMP5]], i32 0, i32 [[TMP2]] +; NVPTX-NEXT: ret i32 [[AND4]] +; + %1 = call i32 @llvm.gpu.shuffle.idx.u32(i64 %lane_mask, i32 %idx, i32 %x, i32 %width) + ret i32 %1 +} + +declare i32 @llvm.gpu.shuffle.idx.u32(i64, i32, i32, i32) + +define void @gpu_exit() { +; AMDGCN-LABEL: @gpu_exit( +; AMDGCN-NEXT: call void @llvm.amdgcn.endpgm() +; AMDGCN-NEXT: ret void +; +; NVPTX-LABEL: @gpu_exit( +; NVPTX-NEXT: call void @llvm.nvvm.exit() +; NVPTX-NEXT: ret void +; + call void @llvm.gpu.exit() + ret void +} + +declare void @llvm.gpu.exit() + +define void @thread_suspend() { +; AMDGCN-LABEL: @thread_suspend( +; AMDGCN-NEXT: call void @llvm.amdgcn.s.sleep(i32 2) +; AMDGCN-NEXT: ret void +; +; NVPTX-LABEL: @thread_suspend( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.reflect(ptr @[[GLOB0:[0-9]+]]) +; NVPTX-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[TMP1]], 699 +; NVPTX-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]] +; NVPTX: 3: +; NVPTX-NEXT: call void @llvm.nvvm.nanosleep(i32 64) +; NVPTX-NEXT: br label [[TMP4]] +; NVPTX: 4: +; NVPTX-NEXT: ret void +; + call 
void @llvm.gpu.thread.suspend()
  ret void
}

declare void @llvm.gpu.thread.suspend()
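
A minimal usage sketch, not part of the patch: portable code can call the new
__builtin_gpu_* builtins directly, and the resulting llvm.gpu.* intrinsics are
lowered by -passes=lower-gpu-intrinsic once a concrete target (amdgcn or nvptx)
is known, e.g. when specialising spirv64 output.

  #include <stdint.h>

  // Flat x-dimension thread index composed from the grid builtins.
  static inline uint32_t flat_thread_id_x(void) {
    return __builtin_gpu_num_threads_x() * __builtin_gpu_block_id_x() +
           __builtin_gpu_thread_id_x();
  }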