https://github.com/JonChesterfield updated https://github.com/llvm/llvm-project/pull/131190
>From b52a04c55ad56e1172dec6262f2536ec3fe7162b Mon Sep 17 00:00:00 2001 From: Jon Chesterfield <jonathanchesterfi...@gmail.com> Date: Wed, 12 Mar 2025 20:55:17 +0000 Subject: [PATCH] [SPIRV] GPU intrinsics --- clang/include/clang/Basic/Builtins.td | 29 + clang/lib/Headers/amdgpuintrin.h | 2 +- clang/lib/Headers/gpuintrin.h | 2 + clang/lib/Headers/spirvintrin.h | 182 +++++ clang/test/CodeGen/amdgpu-grid-builtins.c | 158 +++++ clang/test/CodeGen/gpu_builtins.c | 647 ++++++++++++++++++ clang/test/Headers/gpuintrin.c | 223 ++++++ llvm/include/llvm/IR/Intrinsics.td | 63 ++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 2 + llvm/include/llvm/InitializePasses.h | 1 + llvm/include/llvm/Transforms/Scalar.h | 6 + .../Transforms/Scalar/LowerGPUIntrinsic.h | 26 + llvm/lib/Passes/PassBuilder.cpp | 1 + llvm/lib/Passes/PassRegistry.def | 1 + llvm/lib/Transforms/Scalar/CMakeLists.txt | 1 + .../Transforms/Scalar/LowerGPUIntrinsic.cpp | 501 ++++++++++++++ llvm/lib/Transforms/Scalar/Scalar.cpp | 1 + llvm/test/CodeGen/SPIRV/gpu_intrinsics.ll | 427 ++++++++++++ 18 files changed, 2272 insertions(+), 1 deletion(-) create mode 100644 clang/lib/Headers/spirvintrin.h create mode 100644 clang/test/CodeGen/amdgpu-grid-builtins.c create mode 100644 clang/test/CodeGen/gpu_builtins.c create mode 100644 llvm/include/llvm/Transforms/Scalar/LowerGPUIntrinsic.h create mode 100644 llvm/lib/Transforms/Scalar/LowerGPUIntrinsic.cpp create mode 100644 llvm/test/CodeGen/SPIRV/gpu_intrinsics.ll diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 2fbdfaea57ccd..042508c1e59a8 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -4770,6 +4770,35 @@ def GetDeviceSideMangledName : LangBuiltin<"CUDA_LANG"> { let Prototype = "char const*(...)"; } +// GPU intrinsics +class GPUBuiltin<string prototype> : Builtin { + let Spellings = ["__builtin_" # NAME]; + let Prototype = prototype; + let Attributes = [NoThrow]; +} + +multiclass GPUGridBuiltin<string prototype> { + def _x : GPUBuiltin<prototype>; + def _y : GPUBuiltin<prototype>; + def _z : GPUBuiltin<prototype>; +} + +defm gpu_num_blocks : GPUGridBuiltin<"uint32_t()">; +defm gpu_block_id : GPUGridBuiltin<"uint32_t()">; +defm gpu_num_threads : GPUGridBuiltin<"uint32_t()">; +defm gpu_thread_id : GPUGridBuiltin<"uint32_t()">; + +def gpu_ballot : GPUBuiltin<"uint64_t(uint64_t, bool)">; +def gpu_exit : GPUBuiltin<"void()">; +def gpu_lane_id : GPUBuiltin<"uint32_t()">; +def gpu_lane_mask : GPUBuiltin<"uint64_t()">; +def gpu_num_lanes : GPUBuiltin<"uint32_t()">; +def gpu_read_first_lane_u32 : GPUBuiltin<"uint32_t(uint64_t, uint32_t)">; +def gpu_shuffle_idx_u32 : GPUBuiltin<"uint32_t(uint64_t, uint32_t, uint32_t, uint32_t)">; +def gpu_sync_lane : GPUBuiltin<"void(uint64_t)">; +def gpu_sync_threads : GPUBuiltin<"void()">; +def gpu_thread_suspend : GPUBuiltin<"void()">; + // HLSL def HLSLAddUint64: LangBuiltin<"HLSL_LANG"> { let Spellings = ["__builtin_hlsl_adduint64"]; diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h index f7fb8e2814180..817cfeec896c4 100644 --- a/clang/lib/Headers/amdgpuintrin.h +++ b/clang/lib/Headers/amdgpuintrin.h @@ -1,4 +1,4 @@ -//===-- amdgpuintrin.h - AMDPGU intrinsic functions -----------------------===// +//===-- amdgpuintrin.h - AMDGPU intrinsic functions -----------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h index 0fb3916acac61..a3ce535188a48 100644 --- a/clang/lib/Headers/gpuintrin.h +++ b/clang/lib/Headers/gpuintrin.h @@ -60,6 +60,8 @@ _Pragma("omp end declare target"); #include <nvptxintrin.h> #elif defined(__AMDGPU__) #include <amdgpuintrin.h> +#elif defined(__SPIRV64__) +#include <spirvintrin.h> #elif !defined(_OPENMP) #error "This header is only meant to be used on GPU architectures." #endif diff --git a/clang/lib/Headers/spirvintrin.h b/clang/lib/Headers/spirvintrin.h new file mode 100644 index 0000000000000..3f40ad156d5ce --- /dev/null +++ b/clang/lib/Headers/spirvintrin.h @@ -0,0 +1,182 @@ +//===-- spirvintrin.h - SPIRV intrinsic functions ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __SPIRVINTRIN_H +#define __SPIRVINTRIN_H + +#ifndef __SPIRV64__ +// 32-bit SPIRV is currently a stretch goal +#error "This file is intended for SPIRV64 targets or offloading to SPIRV64" +#endif + +#ifndef __GPUINTRIN_H +#error "Never use <spirvintrin.h> directly; include <gpuintrin.h> instead" +#endif + +// This is the skeleton of the SPIRV implementation of the gpuintrin interface. +// Address spaces and the kernel attribute are not yet implemented. + +#if defined(_OPENMP) +#error "OpenMP is not yet available on SPIRV through the gpuintrin header" +#endif + +// Type aliases to the address spaces used by the SPIRV backend. +#define __gpu_private +#define __gpu_constant +#define __gpu_local +#define __gpu_global +#define __gpu_generic + +// Attribute to declare a function as a kernel. +#define __gpu_kernel + +// Note: because the __builtin_gpu intrinsics lower to amdgcn or nvptx on +// request, the following implementations of these functions would work equally +// well in the amdgpuintrin.h or nvptxintrin.h headers, i.e. we could move this +// definition of __gpu_num_blocks_x et al. into gpuintrin.h and remove them +// from the three target intrin.h headers. + +// Returns the number of workgroups in the 'x' dimension of the grid. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) { + return __builtin_gpu_num_blocks_x(); +} + +// Returns the number of workgroups in the 'y' dimension of the grid. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) { + return __builtin_gpu_num_blocks_y(); +} + +// Returns the number of workgroups in the 'z' dimension of the grid. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) { + return __builtin_gpu_num_blocks_z(); +} + +// Returns the 'x' dimension of the current workgroup's id. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) { + return __builtin_gpu_block_id_x(); +} + +// Returns the 'y' dimension of the current workgroup's id. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) { + return __builtin_gpu_block_id_y(); +} + +// Returns the 'z' dimension of the current workgroup's id. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) { + return __builtin_gpu_block_id_z(); +} + +// Returns the number of workitems in the 'x' dimension. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) { + return __builtin_gpu_num_threads_x(); +} + +// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) { + return __builtin_gpu_num_threads_y(); +} + +// Returns the number of workitems in the 'z' dimension. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) { + return __builtin_gpu_num_threads_z(); +} + +// Returns the 'x' dimension id of the workitem in the current workgroup. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) { + return __builtin_gpu_thread_id_x(); +} + +// Returns the 'y' dimension id of the workitem in the current workgroup. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) { + return __builtin_gpu_thread_id_y(); +} + +// Returns the 'z' dimension id of the workitem in the current workgroup. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) { + return __builtin_gpu_thread_id_z(); +} + +// Returns the size of the wave. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) { + return __builtin_gpu_num_lanes(); +} + +// Returns the id of the thread inside of a wave executing together. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) { + return __builtin_gpu_lane_id(); +} + +// Returns the bit-mask of active threads in the current wave. +_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) { + return __builtin_gpu_lane_mask(); +} + +// Copies the value from the first active thread in the wave to the rest. +_DEFAULT_FN_ATTRS static __inline__ uint32_t +__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) { + return __builtin_gpu_read_first_lane_u32(__lane_mask, __x); +} + +// Returns a bitmask of threads in the current lane for which \p x is true. +_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask, + bool __x) { + return __builtin_gpu_ballot(__lane_mask, __x); +} + +// Waits for all the threads in the block to converge and issues a fence. +_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) { + return __builtin_gpu_sync_threads(); +} + +// Waits for all threads in the wave to converge. +_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) { + return __builtin_gpu_sync_lane(__lane_mask); +} + +// Shuffles the lanes inside the wave according to the given index. +_DEFAULT_FN_ATTRS static __inline__ uint32_t +__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x, + uint32_t __width) { + return __builtin_gpu_shuffle_idx_u32(__lane_mask, __idx, __x, __width); +} + +// Returns a bitmask marking all lanes that have the same value of __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) { + return __gpu_match_any_u32_impl(__lane_mask, __x); +} + +// Returns a bitmask marking all lanes that have the same value of __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) { + return __gpu_match_any_u64_impl(__lane_mask, __x); +} + +// Returns the current lane mask if every lane contains __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) { + return __gpu_match_all_u32_impl(__lane_mask, __x); +} + +// Returns the current lane mask if every lane contains __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) { + return __gpu_match_all_u64_impl(__lane_mask, __x); +} + +// Terminates execution of the associated wave.
+_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) { + return __builtin_gpu_exit(); +} + +// Suspend the thread briefly to assist the scheduler during busy loops. +_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) { + return __builtin_gpu_thread_suspend(); +} + +#endif // __SPIRVINTRIN_H diff --git a/clang/test/CodeGen/amdgpu-grid-builtins.c b/clang/test/CodeGen/amdgpu-grid-builtins.c new file mode 100644 index 0000000000000..2104da2dc3cbc --- /dev/null +++ b/clang/test/CodeGen/amdgpu-grid-builtins.c @@ -0,0 +1,158 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -emit-llvm -O1 %s -o - | FileCheck %s + +#include <stdint.h> + +// CHECK-LABEL: define dso_local noundef i32 @workgroup_id_x( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t workgroup_id_x(void) +{ + return __builtin_amdgcn_workgroup_id_x(); +} + +// CHECK-LABEL: define dso_local noundef i32 @workgroup_id_y( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.workgroup.id.y() +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t workgroup_id_y(void) +{ + return __builtin_amdgcn_workgroup_id_y(); +} + +// CHECK-LABEL: define dso_local noundef i32 @workgroup_id_z( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR3:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.workgroup.id.z() +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t workgroup_id_z(void) +{ + return __builtin_amdgcn_workgroup_id_z(); +} + +// CHECK-LABEL: define dso_local noundef range(i32 0, 1024) i32 @workitem_id_x( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR4:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t workitem_id_x(void) +{ + return __builtin_amdgcn_workitem_id_x(); +} + +// CHECK-LABEL: define dso_local noundef range(i32 0, 1024) i32 @workitem_id_y( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR5:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y() +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t workitem_id_y(void) +{ + return __builtin_amdgcn_workitem_id_y(); +} + +// CHECK-LABEL: define dso_local noundef range(i32 0, 1024) i32 @workitem_id_z( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR6:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z() +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t workitem_id_z(void) +{ + return __builtin_amdgcn_workitem_id_z(); +} + +// CHECK-LABEL: define dso_local range(i32 1, 1025) i32 @workgroup_size_x( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR7:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 12 +// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG3:![0-9]+]], !invariant.load [[META4:![0-9]+]], !noundef [[META4]] +// 
CHECK-NEXT: [[CONV:%.*]] = zext nneg i16 [[TMP2]] to i32 +// CHECK-NEXT: ret i32 [[CONV]] +// +uint32_t workgroup_size_x(void) +{ + return __builtin_amdgcn_workgroup_size_x(); +} + +// CHECK-LABEL: define dso_local range(i32 1, 1025) i32 @workgroup_size_y( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 14 +// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 2, !range [[RNG3]], !invariant.load [[META4]], !noundef [[META4]] +// CHECK-NEXT: [[CONV:%.*]] = zext nneg i16 [[TMP2]] to i32 +// CHECK-NEXT: ret i32 [[CONV]] +// +uint32_t workgroup_size_y(void) +{ + return __builtin_amdgcn_workgroup_size_y(); +} + +// CHECK-LABEL: define dso_local range(i32 1, 1025) i32 @workgroup_size_z( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 8, !range [[RNG3]], !invariant.load [[META4]], !noundef [[META4]] +// CHECK-NEXT: [[CONV:%.*]] = zext nneg i16 [[TMP2]] to i32 +// CHECK-NEXT: ret i32 [[CONV]] +// +uint32_t workgroup_size_z(void) +{ + return __builtin_amdgcn_workgroup_size_z(); +} + +// CHECK-LABEL: define dso_local range(i32 1, 0) i32 @grid_size_x( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR8:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 12 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG5:![0-9]+]], !invariant.load [[META4]] +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t grid_size_x(void) +{ + return __builtin_amdgcn_grid_size_x(); +} + +// CHECK-LABEL: define dso_local range(i32 1, 0) i32 @grid_size_y( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR8]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG5]], !invariant.load [[META4]] +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t grid_size_y(void) +{ + return __builtin_amdgcn_grid_size_y(); +} + +// CHECK-LABEL: define dso_local range(i32 1, 0) i32 @grid_size_z( +// CHECK-SAME: ) local_unnamed_addr #[[ATTR8]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 20 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG5]], !invariant.load [[META4]] +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t grid_size_z(void) +{ + return __builtin_amdgcn_grid_size_z(); +} + +//. +// CHECK: [[RNG3]] = !{i16 1, i16 1025} +// CHECK: [[META4]] = !{} +// CHECK: [[RNG5]] = !{i32 1, i32 0} +//. 
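For context, a minimal usage sketch (an editorial illustration, not part of the patch): device code written against the portable gpuintrin.h interface compiles unchanged for spirv64 through the spirvintrin.h wrappers above, just as it already does for AMDGPU and NVPTX. The kernel name and scaling logic below are hypothetical; only the __gpu_* calls come from the header.

#include <gpuintrin.h>
#include <stdint.h>

// Hypothetical kernel: scale n elements, one element per work item.
__gpu_kernel void scale_u32(uint32_t *data, uint32_t n, uint32_t factor) {
  // Flat global index in the 'x' dimension.
  uint32_t gid =
      __gpu_num_threads_x() * __gpu_block_id_x() + __gpu_thread_id_x();
  if (gid < n)
    data[gid] *= factor;

  // Wave-level helpers come from the same header, e.g. broadcasting the
  // first active lane's value of 'gid' to the rest of the wave.
  uint32_t first = __gpu_read_first_lane_u32(__gpu_lane_mask(), gid);
  (void)first;
}

Compiled for the spirv64 triple (as in the gpu_builtins.c and gpuintrin.c tests below), the __builtin_gpu_* calls behind these wrappers become the generic llvm.gpu.* intrinsics and are only mapped to a concrete architecture later, by the LowerGPUIntrinsic pass added in this patch.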
diff --git a/clang/test/CodeGen/gpu_builtins.c b/clang/test/CodeGen/gpu_builtins.c new file mode 100644 index 0000000000000..8231b0952fa5e --- /dev/null +++ b/clang/test/CodeGen/gpu_builtins.c @@ -0,0 +1,647 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes +// RUN: %clang_cc1 -O1 -triple spirv64 %s -emit-llvm -o - | FileCheck %s --check-prefix=SPIRV64 +// RUN: %clang_cc1 -O1 -triple spirv64-amd-amdhsa %s -emit-llvm -o - | FileCheck %s --check-prefix=AMDHSA +// RUN: %clang_cc1 -O1 -triple nvptx64 -emit-llvm %s -o - | FileCheck %s --check-prefix=NVPTX +// RUN: %clang_cc1 -O1 -triple amdgcn -emit-llvm %s -o - | FileCheck %s --check-prefix=AMDGCN + +#include <stdint.h> + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @num_blocks_x( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.x() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @num_blocks_x( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.blocks.x() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @num_blocks_x( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.x() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @num_blocks_x( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.x() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t num_blocks_x(void) { + return __builtin_gpu_num_blocks_x(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @num_blocks_y( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.y() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @num_blocks_y( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.blocks.y() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @num_blocks_y( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.y() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @num_blocks_y( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.y() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t num_blocks_y(void) { + return __builtin_gpu_num_blocks_y(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @num_blocks_z( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.z() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @num_blocks_z( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.blocks.z() +// AMDHSA-NEXT: 
ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @num_blocks_z( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.z() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @num_blocks_z( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.blocks.z() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t num_blocks_z(void) { + return __builtin_gpu_num_blocks_z(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @block_id_x( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.x() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @block_id_x( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.block.id.x() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @block_id_x( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.x() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @block_id_x( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.x() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t block_id_x(void) { + return __builtin_gpu_block_id_x(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @block_id_y( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.y() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @block_id_y( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.block.id.y() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @block_id_y( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.y() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @block_id_y( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.y() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t block_id_y(void) { + return __builtin_gpu_block_id_y(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @block_id_z( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.z() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @block_id_z( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.block.id.z() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @block_id_z( +// NVPTX-NEXT: 
entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.z() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @block_id_z( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.block.id.z() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t block_id_z(void) { + return __builtin_gpu_block_id_z(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @num_threads_x( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.x() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @num_threads_x( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.threads.x() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @num_threads_x( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.x() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @num_threads_x( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.x() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t num_threads_x(void) { + return __builtin_gpu_num_threads_x(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @num_threads_y( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.y() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @num_threads_y( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.threads.y() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @num_threads_y( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.y() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @num_threads_y( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.y() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t num_threads_y(void) { + return __builtin_gpu_num_threads_y(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @num_threads_z( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.z() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @num_threads_z( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.threads.z() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @num_threads_z( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.z() +// NVPTX-NEXT: ret i32 
[[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @num_threads_z( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.threads.z() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t num_threads_z(void) { + return __builtin_gpu_num_threads_z(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @thread_id_x( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.x() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @thread_id_x( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.thread.id.x() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @thread_id_x( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.x() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @thread_id_x( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.x() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t thread_id_x(void) { + return __builtin_gpu_thread_id_x(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @thread_id_y( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.y() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @thread_id_y( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.thread.id.y() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @thread_id_y( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.y() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @thread_id_y( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.y() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t thread_id_y(void) { + return __builtin_gpu_thread_id_y(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @thread_id_z( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.z() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @thread_id_z( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.thread.id.z() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @thread_id_z( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.z() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: 
@thread_id_z( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.thread.id.z() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t thread_id_z(void) { + return __builtin_gpu_thread_id_z(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @num_lanes( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.lanes() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @num_lanes( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.num.lanes() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @num_lanes( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.lanes() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @num_lanes( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.num.lanes() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t num_lanes(void) { + return __builtin_gpu_num_lanes(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @lane_id( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.lane.id() +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @lane_id( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.lane.id() +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @lane_id( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.lane.id() +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @lane_id( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.lane.id() +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +uint32_t lane_id(void) { + return __builtin_gpu_lane_id(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @lane_mask( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.gpu.lane.mask() +// SPIRV64-NEXT: ret i64 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @lane_mask( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i64 @llvm.gpu.lane.mask() +// AMDHSA-NEXT: ret i64 [[TMP0]] +// +// NVPTX: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @lane_mask( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.gpu.lane.mask() +// NVPTX-NEXT: ret i64 [[TMP0]] +// +// AMDGCN: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @lane_mask( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.gpu.lane.mask() +// AMDGCN-NEXT: ret i64 [[TMP0]] +// +uint64_t lane_mask(void) { + return __builtin_gpu_lane_mask(); +} + + 
+uint32_t +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @read_first_lane_u32( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.read.first.lane.u32(i64 [[__LANE_MASK:%.*]], i32 [[__X:%.*]]) +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @read_first_lane_u32( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.read.first.lane.u32(i64 [[__LANE_MASK:%.*]], i32 [[__X:%.*]]) +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @read_first_lane_u32( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.read.first.lane.u32(i64 [[__LANE_MASK:%.*]], i32 [[__X:%.*]]) +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @read_first_lane_u32( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.read.first.lane.u32(i64 [[__LANE_MASK:%.*]], i32 [[__X:%.*]]) +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) { + return __builtin_gpu_read_first_lane_u32(__lane_mask, __x); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @ballot( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.gpu.ballot(i64 [[__LANE_MASK:%.*]], i1 [[__X:%.*]]) +// SPIRV64-NEXT: ret i64 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @ballot( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i64 @llvm.gpu.ballot(i64 [[__LANE_MASK:%.*]], i1 [[__X:%.*]]) +// AMDHSA-NEXT: ret i64 [[TMP0]] +// +// NVPTX: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @ballot( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.gpu.ballot(i64 [[__LANE_MASK:%.*]], i1 [[__X:%.*]]) +// NVPTX-NEXT: ret i64 [[TMP0]] +// +// AMDGCN: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @ballot( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.gpu.ballot(i64 [[__LANE_MASK:%.*]], i1 [[__X:%.*]]) +// AMDGCN-NEXT: ret i64 [[TMP0]] +// +uint64_t ballot(uint64_t __lane_mask, + _Bool __x) { + return __builtin_gpu_ballot(__lane_mask, __x); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn +// SPIRV64-LABEL: @sync_threads( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: tail call void @llvm.gpu.sync.threads() +// SPIRV64-NEXT: ret void +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn +// AMDHSA-LABEL: @sync_threads( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: tail call addrspace(4) void @llvm.gpu.sync.threads() +// AMDHSA-NEXT: ret void +// +// NVPTX: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn +// NVPTX-LABEL: @sync_threads( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: tail call void @llvm.gpu.sync.threads() +// NVPTX-NEXT: ret void +// +// AMDGCN: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind 
willreturn +// AMDGCN-LABEL: @sync_threads( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: tail call void @llvm.gpu.sync.threads() +// AMDGCN-NEXT: ret void +// +void sync_threads(void) { + return __builtin_gpu_sync_threads(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn +// SPIRV64-LABEL: @sync_lane( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: tail call void @llvm.gpu.sync.lane(i64 [[__LANE_MASK:%.*]]) +// SPIRV64-NEXT: ret void +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn +// AMDHSA-LABEL: @sync_lane( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: tail call addrspace(4) void @llvm.gpu.sync.lane(i64 [[__LANE_MASK:%.*]]) +// AMDHSA-NEXT: ret void +// +// NVPTX: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn +// NVPTX-LABEL: @sync_lane( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: tail call void @llvm.gpu.sync.lane(i64 [[__LANE_MASK:%.*]]) +// NVPTX-NEXT: ret void +// +// AMDGCN: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn +// AMDGCN-LABEL: @sync_lane( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: tail call void @llvm.gpu.sync.lane(i64 [[__LANE_MASK:%.*]]) +// AMDGCN-NEXT: ret void +// +void sync_lane(uint64_t __lane_mask) { + return __builtin_gpu_sync_lane(__lane_mask); +} + + +uint32_t +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// SPIRV64-LABEL: @shuffle_idx_u32( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.shuffle.idx.u32(i64 [[__LANE_MASK:%.*]], i32 [[__IDX:%.*]], i32 [[__X:%.*]], i32 [[__WIDTH:%.*]]) +// SPIRV64-NEXT: ret i32 [[TMP0]] +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDHSA-LABEL: @shuffle_idx_u32( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: [[TMP0:%.*]] = tail call addrspace(4) i32 @llvm.gpu.shuffle.idx.u32(i64 [[__LANE_MASK:%.*]], i32 [[__IDX:%.*]], i32 [[__X:%.*]], i32 [[__WIDTH:%.*]]) +// AMDHSA-NEXT: ret i32 [[TMP0]] +// +// NVPTX: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// NVPTX-LABEL: @shuffle_idx_u32( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.shuffle.idx.u32(i64 [[__LANE_MASK:%.*]], i32 [[__IDX:%.*]], i32 [[__X:%.*]], i32 [[__WIDTH:%.*]]) +// NVPTX-NEXT: ret i32 [[TMP0]] +// +// AMDGCN: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn memory(none) +// AMDGCN-LABEL: @shuffle_idx_u32( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.gpu.shuffle.idx.u32(i64 [[__LANE_MASK:%.*]], i32 [[__IDX:%.*]], i32 [[__X:%.*]], i32 [[__WIDTH:%.*]]) +// AMDGCN-NEXT: ret i32 [[TMP0]] +// +shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x, + uint32_t __width) { + return __builtin_gpu_shuffle_idx_u32(__lane_mask, __idx, __x, __width); +} + +// SPIRV64: Function Attrs: nofree norecurse noreturn nosync nounwind +// SPIRV64-LABEL: @gpu_exit( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: tail call void @llvm.gpu.exit() +// SPIRV64-NEXT: unreachable +// +// AMDHSA: Function Attrs: nofree norecurse noreturn nosync nounwind +// AMDHSA-LABEL: @gpu_exit( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: tail call addrspace(4) void @llvm.gpu.exit() +// AMDHSA-NEXT: unreachable +// +// NVPTX: Function Attrs: convergent nofree norecurse noreturn nosync nounwind +// NVPTX-LABEL: @gpu_exit( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: tail call void 
@llvm.gpu.exit() +// NVPTX-NEXT: unreachable +// +// AMDGCN: Function Attrs: convergent nofree norecurse noreturn nosync nounwind +// AMDGCN-LABEL: @gpu_exit( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: tail call void @llvm.gpu.exit() +// AMDGCN-NEXT: unreachable +// +void gpu_exit(void) { + return __builtin_gpu_exit(); +} + + +// SPIRV64: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn +// SPIRV64-LABEL: @thread_suspend( +// SPIRV64-NEXT: entry: +// SPIRV64-NEXT: tail call void @llvm.gpu.thread.suspend() +// SPIRV64-NEXT: ret void +// +// AMDHSA: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn +// AMDHSA-LABEL: @thread_suspend( +// AMDHSA-NEXT: entry: +// AMDHSA-NEXT: tail call addrspace(4) void @llvm.gpu.thread.suspend() +// AMDHSA-NEXT: ret void +// +// NVPTX: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn +// NVPTX-LABEL: @thread_suspend( +// NVPTX-NEXT: entry: +// NVPTX-NEXT: tail call void @llvm.gpu.thread.suspend() +// NVPTX-NEXT: ret void +// +// AMDGCN: Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn +// AMDGCN-LABEL: @thread_suspend( +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: tail call void @llvm.gpu.thread.suspend() +// AMDGCN-NEXT: ret void +// +void thread_suspend(void) { + return __builtin_gpu_thread_suspend(); +} diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c index 9a15ce277ba87..eaf001be19ac9 100644 --- a/clang/test/Headers/gpuintrin.c +++ b/clang/test/Headers/gpuintrin.c @@ -9,6 +9,11 @@ // RUN: -target-feature +ptx62 \ // RUN: -triple nvptx64-nvidia-cuda -emit-llvm %s -o - \ // RUN: | FileCheck %s --check-prefix=NVPTX +// +// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \ +// RUN: -internal-isystem %S/../../lib/Headers/ \ +// RUN: -triple spirv64-- -emit-llvm %s -o - \ +// RUN: | FileCheck %s --check-prefix=SPIRV64 #include <gpuintrin.h> @@ -978,6 +983,224 @@ __gpu_kernel void foo() { // NVPTX-NEXT: call void @llvm.nvvm.exit() // NVPTX-NEXT: ret void // +// +// SPIRV64-LABEL: define spir_func void @foo( +// SPIRV64-SAME: ) #[[ATTR0:[0-9]+]] { +// SPIRV64-NEXT: [[ENTRY:.*:]] +// SPIRV64-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_num_blocks_x() +// SPIRV64-NEXT: [[CALL1:%.*]] = call spir_func i32 @__gpu_num_blocks_y() +// SPIRV64-NEXT: [[CALL2:%.*]] = call spir_func i32 @__gpu_num_blocks_z() +// SPIRV64-NEXT: [[CALL3:%.*]] = call spir_func i32 @__gpu_num_blocks(i32 noundef 0) +// SPIRV64-NEXT: [[CALL4:%.*]] = call spir_func i32 @__gpu_block_id_x() +// SPIRV64-NEXT: [[CALL5:%.*]] = call spir_func i32 @__gpu_block_id_y() +// SPIRV64-NEXT: [[CALL6:%.*]] = call spir_func i32 @__gpu_block_id_z() +// SPIRV64-NEXT: [[CALL7:%.*]] = call spir_func i32 @__gpu_block_id(i32 noundef 0) +// SPIRV64-NEXT: [[CALL8:%.*]] = call spir_func i32 @__gpu_num_threads_x() +// SPIRV64-NEXT: [[CALL9:%.*]] = call spir_func i32 @__gpu_num_threads_y() +// SPIRV64-NEXT: [[CALL10:%.*]] = call spir_func i32 @__gpu_num_threads_z() +// SPIRV64-NEXT: [[CALL11:%.*]] = call spir_func i32 @__gpu_num_threads(i32 noundef 0) +// SPIRV64-NEXT: [[CALL12:%.*]] = call spir_func i32 @__gpu_thread_id_x() +// SPIRV64-NEXT: [[CALL13:%.*]] = call spir_func i32 @__gpu_thread_id_y() +// SPIRV64-NEXT: [[CALL14:%.*]] = call spir_func i32 @__gpu_thread_id_z() +// SPIRV64-NEXT: [[CALL15:%.*]] = call spir_func i32 @__gpu_thread_id(i32 noundef 0) +// SPIRV64-NEXT: [[CALL16:%.*]] = call spir_func i32 @__gpu_num_lanes() +// SPIRV64-NEXT: [[CALL17:%.*]] = call spir_func i32 
@__gpu_lane_id() +// SPIRV64-NEXT: [[CALL18:%.*]] = call spir_func i64 @__gpu_lane_mask() +// SPIRV64-NEXT: [[CALL19:%.*]] = call spir_func i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) +// SPIRV64-NEXT: [[CALL20:%.*]] = call spir_func i64 @__gpu_read_first_lane_u64(i64 noundef -1, i64 noundef -1) +// SPIRV64-NEXT: [[CALL21:%.*]] = call spir_func i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) +// SPIRV64-NEXT: call spir_func void @__gpu_sync_threads() +// SPIRV64-NEXT: call spir_func void @__gpu_sync_lane(i64 noundef -1) +// SPIRV64-NEXT: [[CALL22:%.*]] = call spir_func i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) +// SPIRV64-NEXT: [[CALL23:%.*]] = call spir_func i64 @__gpu_first_lane_id(i64 noundef -1) +// SPIRV64-NEXT: [[CALL24:%.*]] = call spir_func zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) +// SPIRV64-NEXT: call spir_func void @__gpu_exit() #[[ATTR4:[0-9]+]] +// SPIRV64-NEXT: unreachable +// +// +// SPIRV64-LABEL: define internal spir_func i32 @__gpu_num_blocks( +// SPIRV64-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] { +// SPIRV64-NEXT: [[ENTRY:.*:]] +// SPIRV64-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// SPIRV64-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4 +// SPIRV64-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4 +// SPIRV64-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4 +// SPIRV64-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [ +// SPIRV64-NEXT: i32 0, label %[[SW_BB:.*]] +// SPIRV64-NEXT: i32 1, label %[[SW_BB1:.*]] +// SPIRV64-NEXT: i32 2, label %[[SW_BB3:.*]] +// SPIRV64-NEXT: ] +// SPIRV64: [[SW_BB]]: +// SPIRV64-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_num_blocks_x() +// SPIRV64-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: br label %[[RETURN:.*]] +// SPIRV64: [[SW_BB1]]: +// SPIRV64-NEXT: [[CALL2:%.*]] = call spir_func i32 @__gpu_num_blocks_y() +// SPIRV64-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: br label %[[RETURN]] +// SPIRV64: [[SW_BB3]]: +// SPIRV64-NEXT: [[CALL4:%.*]] = call spir_func i32 @__gpu_num_blocks_z() +// SPIRV64-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: br label %[[RETURN]] +// SPIRV64: [[SW_DEFAULT]]: +// SPIRV64-NEXT: unreachable +// SPIRV64: [[RETURN]]: +// SPIRV64-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: ret i32 [[TMP1]] +// +// +// SPIRV64-LABEL: define internal spir_func i32 @__gpu_block_id( +// SPIRV64-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] { +// SPIRV64-NEXT: [[ENTRY:.*:]] +// SPIRV64-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// SPIRV64-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4 +// SPIRV64-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4 +// SPIRV64-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4 +// SPIRV64-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [ +// SPIRV64-NEXT: i32 0, label %[[SW_BB:.*]] +// SPIRV64-NEXT: i32 1, label %[[SW_BB1:.*]] +// SPIRV64-NEXT: i32 2, label %[[SW_BB3:.*]] +// SPIRV64-NEXT: ] +// SPIRV64: [[SW_BB]]: +// SPIRV64-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_block_id_x() +// SPIRV64-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: br label %[[RETURN:.*]] +// SPIRV64: [[SW_BB1]]: +// SPIRV64-NEXT: [[CALL2:%.*]] = call spir_func i32 @__gpu_block_id_y() +// SPIRV64-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: br label %[[RETURN]] +// SPIRV64: [[SW_BB3]]: +// SPIRV64-NEXT: [[CALL4:%.*]] = call spir_func i32 
@__gpu_block_id_z() +// SPIRV64-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: br label %[[RETURN]] +// SPIRV64: [[SW_DEFAULT]]: +// SPIRV64-NEXT: unreachable +// SPIRV64: [[RETURN]]: +// SPIRV64-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: ret i32 [[TMP1]] +// +// +// SPIRV64-LABEL: define internal spir_func i32 @__gpu_num_threads( +// SPIRV64-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] { +// SPIRV64-NEXT: [[ENTRY:.*:]] +// SPIRV64-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// SPIRV64-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4 +// SPIRV64-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4 +// SPIRV64-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4 +// SPIRV64-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [ +// SPIRV64-NEXT: i32 0, label %[[SW_BB:.*]] +// SPIRV64-NEXT: i32 1, label %[[SW_BB1:.*]] +// SPIRV64-NEXT: i32 2, label %[[SW_BB3:.*]] +// SPIRV64-NEXT: ] +// SPIRV64: [[SW_BB]]: +// SPIRV64-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_num_threads_x() +// SPIRV64-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: br label %[[RETURN:.*]] +// SPIRV64: [[SW_BB1]]: +// SPIRV64-NEXT: [[CALL2:%.*]] = call spir_func i32 @__gpu_num_threads_y() +// SPIRV64-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: br label %[[RETURN]] +// SPIRV64: [[SW_BB3]]: +// SPIRV64-NEXT: [[CALL4:%.*]] = call spir_func i32 @__gpu_num_threads_z() +// SPIRV64-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: br label %[[RETURN]] +// SPIRV64: [[SW_DEFAULT]]: +// SPIRV64-NEXT: unreachable +// SPIRV64: [[RETURN]]: +// SPIRV64-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: ret i32 [[TMP1]] +// +// +// SPIRV64-LABEL: define internal spir_func i32 @__gpu_thread_id( +// SPIRV64-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] { +// SPIRV64-NEXT: [[ENTRY:.*:]] +// SPIRV64-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// SPIRV64-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4 +// SPIRV64-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4 +// SPIRV64-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4 +// SPIRV64-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [ +// SPIRV64-NEXT: i32 0, label %[[SW_BB:.*]] +// SPIRV64-NEXT: i32 1, label %[[SW_BB1:.*]] +// SPIRV64-NEXT: i32 2, label %[[SW_BB3:.*]] +// SPIRV64-NEXT: ] +// SPIRV64: [[SW_BB]]: +// SPIRV64-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_thread_id_x() +// SPIRV64-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: br label %[[RETURN:.*]] +// SPIRV64: [[SW_BB1]]: +// SPIRV64-NEXT: [[CALL2:%.*]] = call spir_func i32 @__gpu_thread_id_y() +// SPIRV64-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: br label %[[RETURN]] +// SPIRV64: [[SW_BB3]]: +// SPIRV64-NEXT: [[CALL4:%.*]] = call spir_func i32 @__gpu_thread_id_z() +// SPIRV64-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: br label %[[RETURN]] +// SPIRV64: [[SW_DEFAULT]]: +// SPIRV64-NEXT: unreachable +// SPIRV64: [[RETURN]]: +// SPIRV64-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4 +// SPIRV64-NEXT: ret i32 [[TMP1]] +// +// +// SPIRV64-LABEL: define internal spir_func i64 @__gpu_read_first_lane_u64( +// SPIRV64-SAME: i64 noundef [[__LANE_MASK:%.*]], i64 noundef [[__X:%.*]]) #[[ATTR0]] { +// SPIRV64-NEXT: [[ENTRY:.*:]] +// SPIRV64-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8 +// SPIRV64-NEXT: [[__X_ADDR:%.*]] = alloca i64, align 8 +// SPIRV64-NEXT: [[__HI:%.*]] = alloca i32, align 4 
+// SPIRV64-NEXT: [[__LO:%.*]] = alloca i32, align 4 +// SPIRV64-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8 +// SPIRV64-NEXT: store i64 [[__X]], ptr [[__X_ADDR]], align 8 +// SPIRV64-NEXT: [[TMP0:%.*]] = load i64, ptr [[__X_ADDR]], align 8 +// SPIRV64-NEXT: [[SHR:%.*]] = lshr i64 [[TMP0]], 32 +// SPIRV64-NEXT: [[CONV:%.*]] = trunc i64 [[SHR]] to i32 +// SPIRV64-NEXT: store i32 [[CONV]], ptr [[__HI]], align 4 +// SPIRV64-NEXT: [[TMP1:%.*]] = load i64, ptr [[__X_ADDR]], align 8 +// SPIRV64-NEXT: [[AND:%.*]] = and i64 [[TMP1]], 4294967295 +// SPIRV64-NEXT: [[CONV1:%.*]] = trunc i64 [[AND]] to i32 +// SPIRV64-NEXT: store i32 [[CONV1]], ptr [[__LO]], align 4 +// SPIRV64-NEXT: [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8 +// SPIRV64-NEXT: [[TMP3:%.*]] = load i32, ptr [[__HI]], align 4 +// SPIRV64-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP2]], i32 noundef [[TMP3]]) +// SPIRV64-NEXT: [[CONV2:%.*]] = zext i32 [[CALL]] to i64 +// SPIRV64-NEXT: [[SHL:%.*]] = shl i64 [[CONV2]], 32 +// SPIRV64-NEXT: [[TMP4:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8 +// SPIRV64-NEXT: [[TMP5:%.*]] = load i32, ptr [[__LO]], align 4 +// SPIRV64-NEXT: [[CALL3:%.*]] = call spir_func i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP4]], i32 noundef [[TMP5]]) +// SPIRV64-NEXT: [[CONV4:%.*]] = zext i32 [[CALL3]] to i64 +// SPIRV64-NEXT: [[AND5:%.*]] = and i64 [[CONV4]], 4294967295 +// SPIRV64-NEXT: [[OR:%.*]] = or i64 [[SHL]], [[AND5]] +// SPIRV64-NEXT: ret i64 [[OR]] +// +// +// SPIRV64-LABEL: define internal spir_func i64 @__gpu_first_lane_id( +// SPIRV64-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] { +// SPIRV64-NEXT: [[ENTRY:.*:]] +// SPIRV64-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8 +// SPIRV64-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8 +// SPIRV64-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8 +// SPIRV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.cttz.i64(i64 [[TMP0]], i1 true) +// SPIRV64-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 1 +// SPIRV64-NEXT: [[ISZERO:%.*]] = icmp eq i64 [[TMP0]], 0 +// SPIRV64-NEXT: [[FFS:%.*]] = select i1 [[ISZERO]], i64 0, i64 [[TMP2]] +// SPIRV64-NEXT: [[CAST:%.*]] = trunc i64 [[FFS]] to i32 +// SPIRV64-NEXT: [[SUB:%.*]] = sub nsw i32 [[CAST]], 1 +// SPIRV64-NEXT: [[CONV:%.*]] = sext i32 [[SUB]] to i64 +// SPIRV64-NEXT: ret i64 [[CONV]] +// +// +// SPIRV64-LABEL: define internal spir_func zeroext i1 @__gpu_is_first_in_lane( +// SPIRV64-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] { +// SPIRV64-NEXT: [[ENTRY:.*:]] +// SPIRV64-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8 +// SPIRV64-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8 +// SPIRV64-NEXT: [[CALL:%.*]] = call spir_func i32 @__gpu_lane_id() +// SPIRV64-NEXT: [[CONV:%.*]] = zext i32 [[CALL]] to i64 +// SPIRV64-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8 +// SPIRV64-NEXT: [[CALL1:%.*]] = call spir_func i64 @__gpu_first_lane_id(i64 noundef [[TMP0]]) +// SPIRV64-NEXT: [[CMP:%.*]] = icmp eq i64 [[CONV]], [[CALL1]] +// SPIRV64-NEXT: ret i1 [[CMP]] +// //. 
// AMDGPU: [[RNG3]] = !{i32 1, i32 0} // AMDGPU: [[META4]] = !{} diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 62239ca705b9e..c0613786a13bb 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2861,6 +2861,69 @@ def int_experimental_convergence_anchor def int_experimental_convergence_loop : DefaultAttrsIntrinsic<[llvm_token_ty], [], [IntrNoMem, IntrConvergent]>; +//===------- GPU Intrinsics -----------------------------------------------===// + +class GPUIntrinsic<LLVMType ret_type, string name> + : DefaultAttrsIntrinsic<[ret_type], + [], + [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>, + ClangBuiltin<name>; + +multiclass GPUGridIntrinsic_xyz<string prefix> { + def _x : GPUIntrinsic<llvm_i32_ty, !strconcat(prefix, "_x")>; + def _y : GPUIntrinsic<llvm_i32_ty, !strconcat(prefix, "_y")>; + def _z : GPUIntrinsic<llvm_i32_ty, !strconcat(prefix, "_z")>; +} + +defm int_gpu_num_blocks : GPUGridIntrinsic_xyz<"__builtin_gpu_num_blocks">; +defm int_gpu_block_id : GPUGridIntrinsic_xyz<"__builtin_gpu_block_id">; +defm int_gpu_num_threads : GPUGridIntrinsic_xyz<"__builtin_gpu_num_threads">; +defm int_gpu_thread_id : GPUGridIntrinsic_xyz<"__builtin_gpu_thread_id">; + +def int_gpu_num_lanes : GPUIntrinsic<llvm_i32_ty,"__builtin_gpu_num_lanes">; +def int_gpu_lane_id : GPUIntrinsic<llvm_i32_ty,"__builtin_gpu_lane_id">; +def int_gpu_lane_mask : GPUIntrinsic<llvm_i64_ty,"__builtin_gpu_lane_mask">; + +def int_gpu_read_first_lane_u32 : DefaultAttrsIntrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty], + [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>, + ClangBuiltin<"__builtin_gpu_read_first_lane_u32">; + +def int_gpu_shuffle_idx_u32 : DefaultAttrsIntrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrConvergent, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree]>, + ClangBuiltin<"__builtin_gpu_shuffle_idx_u32">; + +def int_gpu_ballot : DefaultAttrsIntrinsic<[llvm_i64_ty], + [llvm_i64_ty, llvm_i1_ty], + [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>, + ClangBuiltin<"__builtin_gpu_ballot">; + +def int_gpu_sync_threads : DefaultAttrsIntrinsic<[], [], + // todo, attributes match barrier, but there's a fence in here too + // also why is there a fence in here? 
+ [IntrNoMem, IntrHasSideEffects, IntrConvergent, + IntrWillReturn, IntrNoCallback, IntrNoFree]>, + ClangBuiltin<"__builtin_gpu_sync_threads">; + +def int_gpu_sync_lane : DefaultAttrsIntrinsic<[], + [llvm_i64_ty], + [IntrNoMem, IntrHasSideEffects, IntrConvergent, + IntrWillReturn, IntrNoCallback, IntrNoFree]>, + ClangBuiltin<"__builtin_gpu_sync_lane">; + +def int_gpu_exit : DefaultAttrsIntrinsic<[], + [], + [IntrNoReturn, IntrConvergent]>, + ClangBuiltin<"__builtin_gpu_exit">; + + +def int_gpu_thread_suspend : DefaultAttrsIntrinsic<[], + [], + [IntrWillReturn, IntrNoMem, IntrHasSideEffects, IntrConvergent]>, + ClangBuiltin<"__builtin_gpu_thread_suspend">; + + //===----------------------------------------------------------------------===// // Target-specific intrinsics //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 86e050333acc7..91095f2880d03 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -150,6 +150,8 @@ defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz; defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named <"__builtin_amdgcn_workgroup_id">; +defm int_amdgcn_grid_size : AMDGPUReadPreloadRegisterIntrinsic_xyz; + def int_amdgcn_dispatch_ptr : DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [], [Align<RetIndex, 4>, NoUndef<RetIndex>, NonNull<RetIndex>, IntrNoMem, IntrSpeculatable]>; diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 78ff93019fd7e..9cbe7b068c394 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -175,6 +175,7 @@ void initializeLoopUnrollPass(PassRegistry &); void initializeLowerAtomicLegacyPassPass(PassRegistry &); void initializeLowerEmuTLSPass(PassRegistry &); void initializeLowerGlobalDtorsLegacyPassPass(PassRegistry &); +void initializeLowerGPUIntrinsicPass(PassRegistry &); void initializeLowerIntrinsicsPass(PassRegistry &); void initializeLowerInvokeLegacyPassPass(PassRegistry &); void initializeLowerSwitchLegacyPassPass(PassRegistry &); diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h index fc772a7639c47..d746b7a39b871 100644 --- a/llvm/include/llvm/Transforms/Scalar.h +++ b/llvm/include/llvm/Transforms/Scalar.h @@ -135,6 +135,12 @@ FunctionPass *createSinkingPass(); // Pass *createLowerAtomicPass(); +//===----------------------------------------------------------------------===// +// +// LowerGPUIntrinsic - Lower GPU intrinsics +// +Pass *createLowerGPUIntrinsicPass(); + //===----------------------------------------------------------------------===// // // MergeICmps - Merge integer comparison chains into a memcmp diff --git a/llvm/include/llvm/Transforms/Scalar/LowerGPUIntrinsic.h b/llvm/include/llvm/Transforms/Scalar/LowerGPUIntrinsic.h new file mode 100644 index 0000000000000..6e793d4965287 --- /dev/null +++ b/llvm/include/llvm/Transforms/Scalar/LowerGPUIntrinsic.h @@ -0,0 +1,26 @@ +//===--- LowerGPUIntrinsic.h - Lower GPU intrinsics -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass lowers GPU intrinsics. 
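+// The llvm.gpu.* intrinsics are rewritten to target specific intrinsics or
+// short instruction sequences; see the lookup table in LowerGPUIntrinsic.cpp.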
+// +//===----------------------------------------------------------------------===// +#ifndef LLVM_TRANSFORMS_SCALAR_LOWERGPUINTRINSIC_H +#define LLVM_TRANSFORMS_SCALAR_LOWERGPUINTRINSIC_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +struct LowerGPUIntrinsicPass : public PassInfoMixin<LowerGPUIntrinsicPass> { + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + + static bool isRequired() { return true; } // otherwise O0 doesn't run it +}; +} // namespace llvm + +#endif // LLVM_TRANSFORMS_SCALAR_LOWERGPUINTRINSIC_H diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 99e78d3b6feb8..32234df9a364f 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -295,6 +295,7 @@ #include "llvm/Transforms/Scalar/LowerAtomicPass.h" #include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h" #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h" +#include "llvm/Transforms/Scalar/LowerGPUIntrinsic.h" #include "llvm/Transforms/Scalar/LowerGuardIntrinsic.h" #include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h" #include "llvm/Transforms/Scalar/LowerWidenableCondition.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 586d4b7e02fc1..f24b0a2e37329 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -96,6 +96,7 @@ MODULE_PASS("iroutliner", IROutlinerPass()) MODULE_PASS("jmc-instrumenter", JMCInstrumenterPass()) MODULE_PASS("lower-emutls", LowerEmuTLSPass()) MODULE_PASS("lower-global-dtors", LowerGlobalDtorsPass()) +MODULE_PASS("lower-gpu-intrinsic", LowerGPUIntrinsicPass()) MODULE_PASS("lower-ifunc", LowerIFuncPass()) MODULE_PASS("lowertypetests", LowerTypeTestsPass()) MODULE_PASS("fatlto-cleanup", FatLtoCleanup()) diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt index 84a5b02043d01..f35c81f2e661b 100644 --- a/llvm/lib/Transforms/Scalar/CMakeLists.txt +++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -51,6 +51,7 @@ add_llvm_component_library(LLVMScalarOpts LowerAtomicPass.cpp LowerConstantIntrinsics.cpp LowerExpectIntrinsic.cpp + LowerGPUIntrinsic.cpp LowerGuardIntrinsic.cpp LowerMatrixIntrinsics.cpp LowerWidenableCondition.cpp diff --git a/llvm/lib/Transforms/Scalar/LowerGPUIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerGPUIntrinsic.cpp new file mode 100644 index 0000000000000..6a0b70e52c309 --- /dev/null +++ b/llvm/lib/Transforms/Scalar/LowerGPUIntrinsic.cpp @@ -0,0 +1,501 @@ +//===- LowerGPUIntrinsic.cpp ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Lower the llvm.gpu intrinsics to target specific code sequences. +// Can be called from clang if building for a specific GPU or from the backend +// as part of a SPIRV lowering pipeline. Initial pass can lower to amdgcn or +// nvptx, adding further architectures means adding a column to the lookup table +// and further intrinsics adding a row. +// +// The idea is for the intrinsics to represent a thin abstraction over the +// different GPU architectures. 
In particular, code compiled to spirv-- without +// specifying a specific target can be specialised at JIT time, at which point +// this pass will rewrite those intrinsics to ones that the current backend +// knows. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/LowerGPUIntrinsic.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/ConstantRange.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/IntrinsicsNVPTX.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/TargetParser/Triple.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +#define DEBUG_TYPE "lower-gpu-intrinsic" + +using namespace llvm; + +namespace { + +// For each intrinsic, specify what function to call to lower it +typedef bool (*lowerFunction)(Module &M, IRBuilder<> &, Intrinsic::ID from, + CallBase *CI); + +// Simple lowering, directly replace the intrinsic with a different one +// with the same type, and optionally refine range metadata on the return value +template <Intrinsic::ID To> +bool S(Module &M, IRBuilder<> &, Intrinsic::ID from, CallBase *CI) { + + static_assert(To != Intrinsic::not_intrinsic); + Intrinsic::ID GenericID = from; + Intrinsic::ID SpecificID = To; + + bool Changed = false; + Function *Generic = Intrinsic::getDeclarationIfExists(&M, GenericID); + auto *Specific = Intrinsic::getOrInsertDeclaration(&M, SpecificID); + + if ((Generic->getType() != Specific->getType()) || + (Generic->getReturnType() != Specific->getReturnType())) + report_fatal_error("LowerGPUIntrinsic: Inconsistent types between " + "intrinsics in lookup table"); + + CI->setCalledFunction(Specific); + Changed = true; + + return Changed; +} + +// Replace intrinsic call with a linear sequence of instructions +typedef Value *(*builder)(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI); + +template <builder F> +bool B(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, CallBase *CI) { + bool Changed = false; + + Builder.SetInsertPoint(CI); + + Value *replacement = F(M, Builder, from, CI); + if (replacement) { + CI->replaceAllUsesWith(replacement); + CI->eraseFromParent(); + Changed = true; + } + + return Changed; +} + +template <Intrinsic::ID Numerator, Intrinsic::ID Denominator> +Value *intrinsicRatio(Module &M, IRBuilder<> &Builder, Intrinsic::ID, + CallBase *) { + Value *N = Builder.CreateIntrinsic(Numerator, {}, {}); + Value *D = Builder.CreateIntrinsic(Denominator, {}, {}); + return Builder.CreateUDiv(N, D); +} + +namespace amdgpu { +Value *lane_mask(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + auto &Ctx = M.getContext(); + return Builder.CreateIntrinsic( + Intrinsic::amdgcn_ballot, {Type::getInt64Ty(Ctx)}, + {ConstantInt::get(Type::getInt1Ty(Ctx), true)}); +} + +Value *lane_id(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + auto &Ctx = M.getContext(); + Constant *M1 = ConstantInt::get(Type::getInt32Ty(Ctx), -1); + Constant *Z = ConstantInt::get(Type::getInt32Ty(Ctx), 0); + + CallInst *Lo = + Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, {M1, Z}); + return Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {M1, Lo}); +} + +Value 
*first_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + auto &Ctx = M.getContext(); + return Builder.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, + {Type::getInt32Ty(Ctx)}, + {CI->getArgOperand(1)}); +} + +Value *shuffle_idx(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + auto &Ctx = M.getContext(); + + Value *idx = CI->getArgOperand(1); + Value *x = CI->getArgOperand(2); + Value *width = CI->getArgOperand(3); + + Value *id = Builder.CreateIntrinsic(Intrinsic::gpu_lane_id, {}, {}); + + Value *n = Builder.CreateSub(ConstantInt::get(Type::getInt32Ty(Ctx), 0), + width, "not"); + Value *a = Builder.CreateAnd(id, n, "and"); + Value *add = Builder.CreateAdd(a, idx, "add"); + Value *shl = + Builder.CreateShl(add, ConstantInt::get(Type::getInt32Ty(Ctx), 2), "shl"); + return Builder.CreateIntrinsic(Intrinsic::amdgcn_ds_bpermute, {}, {shl, x}); +} + +Value *ballot(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + auto &Ctx = M.getContext(); + + Value *C = + Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot, {Type::getInt64Ty(Ctx)}, + {CI->getArgOperand(1)}); + + return Builder.CreateAnd(C, CI->getArgOperand(0)); +} + +Value *sync_threads(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + auto &Ctx = M.getContext(); + Builder.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {}); + + Value *F = Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, + Ctx.getOrInsertSyncScopeID("workgroup")); + + return F; +} + +Value *sync_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + return Builder.CreateIntrinsic(Intrinsic::amdgcn_wave_barrier, {}, {}); +} + +Value *thread_suspend(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + + auto &Ctx = M.getContext(); + return Builder.CreateIntrinsic(Intrinsic::amdgcn_s_sleep, {}, + {ConstantInt::get(Type::getInt32Ty(Ctx), 2)}); +} + +Value *dispatch_ptr(IRBuilder<> &Builder) { + CallInst *Call = + Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {}, {}); + Call->addRetAttr( + Attribute::getWithDereferenceableBytes(Call->getContext(), 64)); + Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4))); + return Call; +} + +Value *implicit_arg_ptr(IRBuilder<> &Builder) { + CallInst *Call = + Builder.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {}, {}); + Call->addRetAttr( + Attribute::getWithDereferenceableBytes(Call->getContext(), 256)); + Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8))); + return Call; +} + +template <unsigned Index> +Value *grid_size(Module &M, IRBuilder<> &Builder, Intrinsic::ID, CallBase *) { + auto &Ctx = M.getContext(); + const unsigned XOffset = 12; + auto *DP = dispatch_ptr(Builder); + + // Indexing the HSA kernel_dispatch_packet struct. + auto *Offset = ConstantInt::get(Type::getInt32Ty(Ctx), XOffset + Index * 4); + auto *GEP = Builder.CreateGEP(Type::getInt8Ty(Ctx), DP, Offset); + auto *LD = Builder.CreateLoad(Type::getInt32Ty(Ctx), GEP); + llvm::MDBuilder MDB(Ctx); + // Known non-zero. + LD->setMetadata(llvm::LLVMContext::MD_range, + MDB.createRange(APInt(32, 1), APInt::getZero(32))); + LD->setMetadata(llvm::LLVMContext::MD_invariant_load, + llvm::MDNode::get(Ctx, {})); + return LD; +} + +template <int Index> +Value *WGSize(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + + // Note: "__oclc_ABI_version" is supposed to be emitted and initialized by + // clang during compilation of user code. 
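+  // When it is present, the comparison against COV_5 below selects between the
+  // implicit argument block and the dispatch packet as the source of the size.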
+ StringRef Name = "__oclc_ABI_version"; + auto *ABIVersionC = M.getNamedGlobal(Name); + if (!ABIVersionC) { + // In CGBuiltin, we'd have to create an extern variable to emit the load for + // Here, we can leave the intrinsic in place and it'll get lowered later + return nullptr; + } + auto &Ctx = M.getContext(); + + Value *ABIVersion = Builder.CreateLoad(Type::getInt32Ty(Ctx), ABIVersionC); + + Value *IsCOV5 = Builder.CreateICmpSGE( + ABIVersion, + ConstantInt::get(Type::getInt32Ty(Ctx), CodeObjectVersionKind::COV_5)); + + Value *ImplicitGEP = Builder.CreateConstGEP1_32( + Type::getInt8Ty(Ctx), implicit_arg_ptr(Builder), 12 + Index * 2); + + // Indexing the HSA kernel_dispatch_packet struct. + Value *DispatchGEP = Builder.CreateConstGEP1_32( + Type::getInt8Ty(Ctx), dispatch_ptr(Builder), 4 + Index * 2); + + auto Result = Builder.CreateSelect(IsCOV5, ImplicitGEP, DispatchGEP); + LoadInst *LD = Builder.CreateLoad(Type::getInt16Ty(Ctx), Result); + + // TODO: CGBuiltin digs MaxOpenCLWorkGroupSize out of targetinfo and limtis + // the range on the load based on that (MD_range) + + LD->setMetadata(llvm::LLVMContext::MD_noundef, llvm::MDNode::get(Ctx, {})); + LD->setMetadata(llvm::LLVMContext::MD_invariant_load, + llvm::MDNode::get(Ctx, {})); + + // The workgroup size is a uint16_t but gpu_block_id returns a uint32_t + return Builder.CreateZExt(LD, Type::getInt32Ty(Ctx)); +} + +} // namespace amdgpu + +namespace nvptx { +Value *lane_mask(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + auto &Ctx = M.getContext(); + CallInst *C = Builder.CreateIntrinsic(Intrinsic::nvvm_activemask, {}, {}); + return Builder.CreateZExt(C, Type::getInt64Ty(Ctx), "conv"); +} + +Value *first_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + auto &Ctx = M.getContext(); + Value *conv = + Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(Ctx), "conv"); + Value *C = Builder.CreateIntrinsic( + Intrinsic::cttz, {Type::getInt32Ty(Ctx)}, + {conv, ConstantInt::get(Type::getInt1Ty(Ctx), true)}); + Value *iszero = Builder.CreateICmpEQ( + conv, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "iszero"); + Value *sub = Builder.CreateSelect( + iszero, ConstantInt::get(Type::getInt32Ty(Ctx), -1), C, "sub"); + + return Builder.CreateIntrinsic(Intrinsic::nvvm_shfl_sync_idx_i32, {}, + {conv, CI->getArgOperand(1), sub, + ConstantInt::get(Type::getInt32Ty(Ctx), 31)}); +} + +Value *shuffle_idx(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + auto &Ctx = M.getContext(); + + Value *lane_mask = CI->getArgOperand(0); + Value *idx = CI->getArgOperand(1); + Value *x = CI->getArgOperand(2); + Value *width = CI->getArgOperand(3); + + Value *Conv = Builder.CreateTrunc(lane_mask, Type::getInt32Ty(Ctx), "conv"); + + Value *sh_prom = Builder.CreateZExt(idx, Type::getInt64Ty(Ctx), "sh_prom"); + Value *shl0 = + Builder.CreateShl(width, ConstantInt::get(Type::getInt32Ty(Ctx), 8)); + Value *or0 = Builder.CreateSub(ConstantInt::get(Type::getInt32Ty(Ctx), 8223), + shl0, "or"); + + Value *core = Builder.CreateIntrinsic(Intrinsic::nvvm_shfl_sync_idx_i32, {}, + {Conv, x, idx, or0}); + + Value *shl1 = + Builder.CreateShl(ConstantInt::get(Type::getInt64Ty(Ctx), 1), sh_prom); + Value *and0 = Builder.CreateAnd(shl1, lane_mask); + Value *cmp = + Builder.CreateICmpEQ(and0, ConstantInt::get(Type::getInt64Ty(Ctx), 0)); + Value *and4 = Builder.CreateSelect( + cmp, ConstantInt::get(Type::getInt32Ty(Ctx), 0), core, "and4"); + + return and4; +} + +Value *ballot(Module &M, IRBuilder<> 
&Builder, Intrinsic::ID from, + CallBase *CI) { + auto &Ctx = M.getContext(); + Value *Conv = + Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(Ctx), "conv"); + Value *C = Builder.CreateIntrinsic(Intrinsic::nvvm_vote_ballot_sync, {}, + {Conv, CI->getArgOperand(1)}); + + return Builder.CreateZExt(C, Type::getInt64Ty(Ctx), "conv"); +} + +Value *sync_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + + auto &Ctx = M.getContext(); + Value *X = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(Ctx)); + return Builder.CreateIntrinsic(Intrinsic::nvvm_bar_warp_sync, {}, {X}); +} + +Value *thread_suspend(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, + CallBase *CI) { + + auto &Ctx = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + + Value *str = Builder.CreateGlobalString( + "__CUDA_ARCH", "", DL.getDefaultGlobalsAddressSpace(), &M); + + Builder.SetInsertPoint(CI); + Value *Reflect = Builder.CreateIntrinsic(Intrinsic::nvvm_reflect, {}, {str}); + Value *Cmp = Builder.CreateICmpUGT( + Reflect, ConstantInt::get(Type::getInt32Ty(Ctx), 699)); + + Builder.SetInsertPoint(SplitBlockAndInsertIfThen(Cmp, CI, false)); + + Builder.CreateIntrinsic(Intrinsic::nvvm_nanosleep, {}, + {ConstantInt::get(Type::getInt32Ty(Ctx), 64)}); + + CI->eraseFromParent(); + return nullptr; // All done +} + +} // namespace nvptx + +struct IntrinsicMap { + Intrinsic::ID Generic; + lowerFunction AMDGPU; + lowerFunction NVPTX; +}; + +using namespace Intrinsic; + +static const IntrinsicMap ls[] = { + // This table of intrinsic => what to do with it is walked in order. + // A row can create calls to intrinsics that are expanded in subsequent rows + // but that does mean that the order of rows is somewhat significant. + // S<intrinsic> is a simple lowering to an existing intrinsic + // B<function> involves building a short sequence of instructions + + // amdgpu defines num_blocks as an integer ratio of two other intrinsics. + // amdgcn_grid_size and gpu_num_threads are expanded further down. + { + gpu_num_blocks_x, + B<intrinsicRatio<amdgcn_grid_size_x, gpu_num_threads_x>>, + S<nvvm_read_ptx_sreg_nctaid_x>, + }, + { + gpu_num_blocks_y, + B<intrinsicRatio<amdgcn_grid_size_y, gpu_num_threads_y>>, + S<nvvm_read_ptx_sreg_nctaid_y>, + }, + { + gpu_num_blocks_z, + B<intrinsicRatio<amdgcn_grid_size_z, gpu_num_threads_z>>, + S<nvvm_read_ptx_sreg_nctaid_z>, + }, + + // Note: Could canonicalise in favour of the target agnostic one without + // breaking existing users of builtin or intrinsic: + // {amdgcn_workgroup_id_x, S<gpu_block_id_x>, nullptr}, + // {gpu_block_id_x, nullptr, S<nvvm_read_ptx_sreg_ctaid_x>}, + // Using the target agnostic one throughout the rest of the backend would + // work fine, and amdgpu-no-workgroup-id-x attribute and similar may be + // applicable to other targets. + // Map {block,thread}_id onto existing intrinsics for the time being. 
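+    // A nullptr entry means calls to that intrinsic are left unchanged for the
+    // corresponding target (see runOnModule below).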
+ {gpu_block_id_x, S<amdgcn_workgroup_id_x>, S<nvvm_read_ptx_sreg_ctaid_x>}, + {gpu_block_id_y, S<amdgcn_workgroup_id_y>, S<nvvm_read_ptx_sreg_ctaid_y>}, + {gpu_block_id_z, S<amdgcn_workgroup_id_z>, S<nvvm_read_ptx_sreg_ctaid_z>}, + {gpu_thread_id_x, S<amdgcn_workitem_id_x>, S<nvvm_read_ptx_sreg_tid_x>}, + {gpu_thread_id_y, S<amdgcn_workitem_id_y>, S<nvvm_read_ptx_sreg_tid_y>}, + {gpu_thread_id_z, S<amdgcn_workitem_id_z>, S<nvvm_read_ptx_sreg_tid_z>}, + + // CGBuiltin maps builtin_amdgcn_workgroup_size onto gpu_num_threads + {gpu_num_threads_x, B<amdgpu::WGSize<0>>, S<nvvm_read_ptx_sreg_ntid_x>}, + {gpu_num_threads_y, B<amdgpu::WGSize<1>>, S<nvvm_read_ptx_sreg_ntid_y>}, + {gpu_num_threads_z, B<amdgpu::WGSize<2>>, S<nvvm_read_ptx_sreg_ntid_z>}, + + // Some of the following intrinsics need minor impedance matching + {gpu_num_lanes, S<amdgcn_wavefrontsize>, S<nvvm_read_ptx_sreg_warpsize>}, + {gpu_lane_mask, B<amdgpu::lane_mask>, B<nvptx::lane_mask>}, + + {gpu_read_first_lane_u32, B<amdgpu::first_lane>, B<nvptx::first_lane>}, + {gpu_shuffle_idx_u32, B<amdgpu::shuffle_idx>, B<nvptx::shuffle_idx>}, + + // shuffle sometimes emits call into lane_id so lower lane_id after shuffle + {gpu_lane_id, B<amdgpu::lane_id>, S<nvvm_read_ptx_sreg_laneid>}, + + {gpu_ballot, B<amdgpu::ballot>, B<nvptx::ballot>}, + + {gpu_sync_threads, B<amdgpu::sync_threads>, S<nvvm_barrier0>}, + {gpu_sync_lane, B<amdgpu::sync_lane>, B<nvptx::sync_lane>}, + + {gpu_thread_suspend, B<amdgpu::thread_suspend>, B<nvptx::thread_suspend>}, + {gpu_exit, S<amdgcn_endpgm>, S<nvvm_exit>}, + + // These aren't generic intrinsics but lowering them here instead of + // in CGBuiltin allows the above to be implemented partly in terms of + // amdgcn_grid_size. + {amdgcn_grid_size_x, B<amdgpu::grid_size<0>>, nullptr}, + {amdgcn_grid_size_y, B<amdgpu::grid_size<1>>, nullptr}, + {amdgcn_grid_size_z, B<amdgpu::grid_size<2>>, nullptr}, +}; + +class LowerGPUIntrinsic : public ModulePass { +public: + static char ID; + + LowerGPUIntrinsic() : ModulePass(ID) {} + + bool runOnModule(Module &M) override; +}; + +bool LowerGPUIntrinsic::runOnModule(Module &M) { + bool Changed = false; + + Triple TT(M.getTargetTriple()); + + if (!TT.isAMDGPU() && !TT.isNVPTX()) { + return Changed; + } + + auto &Ctx = M.getContext(); + IRBuilder<> Builder(Ctx); + + for (const IntrinsicMap &I : ls) { + auto *Intr = Intrinsic::getDeclarationIfExists(&M, I.Generic); + if (!Intr) + continue; + + lowerFunction maybeLowering = TT.isAMDGPU() ? I.AMDGPU : I.NVPTX; + if (maybeLowering == nullptr) + continue; + + for (auto *U : make_early_inc_range(Intr->users())) { + if (auto *CI = dyn_cast<CallBase>(U)) { + if (CI->getCalledFunction() == Intr) + Changed |= maybeLowering(M, Builder, I.Generic, CI); + } + } + } + + return Changed; +} + +} // namespace + +char LowerGPUIntrinsic::ID = 0; + +INITIALIZE_PASS(LowerGPUIntrinsic, DEBUG_TYPE, "Lower GPU Intrinsic", false, + false) + +Pass *llvm::createLowerGPUIntrinsicPass() { return new LowerGPUIntrinsic(); } + +PreservedAnalyses LowerGPUIntrinsicPass::run(Module &M, + ModuleAnalysisManager &) { + return LowerGPUIntrinsic().runOnModule(M) ? 
PreservedAnalyses::none() + : PreservedAnalyses::all(); +} diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp index c7e4a3e824700..66ef7ebfa5fe5 100644 --- a/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -34,6 +34,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLoopTermFoldPass(Registry); initializeLoopUnrollPass(Registry); initializeLowerAtomicLegacyPassPass(Registry); + initializeLowerGPUIntrinsicPass(Registry); initializeMergeICmpsLegacyPassPass(Registry); initializeNaryReassociateLegacyPassPass(Registry); initializePartiallyInlineLibCallsLegacyPassPass(Registry); diff --git a/llvm/test/CodeGen/SPIRV/gpu_intrinsics.ll b/llvm/test/CodeGen/SPIRV/gpu_intrinsics.ll new file mode 100644 index 0000000000000..41c14bc323e61 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/gpu_intrinsics.ll @@ -0,0 +1,427 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes +; RUN: opt -S -mtriple=amdgcn-- -passes=lower-gpu-intrinsic < %s | FileCheck %s --check-prefix=AMDGCN +; RUN: opt -S -mtriple=nvptx64-- -passes=lower-gpu-intrinsic < %s | FileCheck %s --check-prefix=NVPTX + +; Used by amdgpu to lower llvm.gpu.num.threads, harmless on nvptx +@__oclc_ABI_version = weak_odr hidden addrspace(4) constant i32 500 + +define i32 @num_blocks_x() { +; AMDGCN-LABEL: @num_blocks_x( +; AMDGCN-NEXT: [[TMP1:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP1]], i32 12 +; AMDGCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !range [[RNG0:![0-9]+]], !invariant.load [[META1:![0-9]+]] +; AMDGCN-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4 +; AMDGCN-NEXT: [[TMP13:%.*]] = icmp sge i32 [[TMP12]], 500 +; AMDGCN-NEXT: [[TMP6:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; AMDGCN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 12 +; AMDGCN-NEXT: [[TMP8:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP8]], i32 4 +; AMDGCN-NEXT: [[TMP10:%.*]] = select i1 [[TMP13]], ptr addrspace(4) [[TMP7]], ptr addrspace(4) [[TMP9]] +; AMDGCN-NEXT: [[TMP11:%.*]] = load i16, ptr addrspace(4) [[TMP10]], align 2, !invariant.load [[META1]], !noundef [[META1]] +; AMDGCN-NEXT: [[TMP4:%.*]] = zext i16 [[TMP11]] to i32 +; AMDGCN-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP3]], [[TMP4]] +; AMDGCN-NEXT: ret i32 [[TMP5]] +; +; NVPTX-LABEL: @num_blocks_x( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.num.blocks.x() + ret i32 %1 +} + +declare i32 @llvm.gpu.num.blocks.x() + +define i32 @num_blocks_y() { +; AMDGCN-LABEL: @num_blocks_y( +; AMDGCN-NEXT: [[TMP1:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP1]], i32 16 +; AMDGCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !range [[RNG0]], !invariant.load [[META1]] +; AMDGCN-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4 +; AMDGCN-NEXT: [[TMP13:%.*]] = icmp sge i32 [[TMP12]], 500 +; AMDGCN-NEXT: [[TMP6:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; 
AMDGCN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 14 +; AMDGCN-NEXT: [[TMP8:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP8]], i32 6 +; AMDGCN-NEXT: [[TMP10:%.*]] = select i1 [[TMP13]], ptr addrspace(4) [[TMP7]], ptr addrspace(4) [[TMP9]] +; AMDGCN-NEXT: [[TMP11:%.*]] = load i16, ptr addrspace(4) [[TMP10]], align 2, !invariant.load [[META1]], !noundef [[META1]] +; AMDGCN-NEXT: [[TMP4:%.*]] = zext i16 [[TMP11]] to i32 +; AMDGCN-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP3]], [[TMP4]] +; AMDGCN-NEXT: ret i32 [[TMP5]] +; +; NVPTX-LABEL: @num_blocks_y( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.num.blocks.y() + ret i32 %1 +} + +declare i32 @llvm.gpu.num.blocks.y() + +define i32 @num_blocks_z() { +; AMDGCN-LABEL: @num_blocks_z( +; AMDGCN-NEXT: [[TMP1:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP1]], i32 20 +; AMDGCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !range [[RNG0]], !invariant.load [[META1]] +; AMDGCN-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4 +; AMDGCN-NEXT: [[TMP13:%.*]] = icmp sge i32 [[TMP12]], 500 +; AMDGCN-NEXT: [[TMP6:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; AMDGCN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 16 +; AMDGCN-NEXT: [[TMP8:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP8]], i32 8 +; AMDGCN-NEXT: [[TMP10:%.*]] = select i1 [[TMP13]], ptr addrspace(4) [[TMP7]], ptr addrspace(4) [[TMP9]] +; AMDGCN-NEXT: [[TMP11:%.*]] = load i16, ptr addrspace(4) [[TMP10]], align 2, !invariant.load [[META1]], !noundef [[META1]] +; AMDGCN-NEXT: [[TMP4:%.*]] = zext i16 [[TMP11]] to i32 +; AMDGCN-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP3]], [[TMP4]] +; AMDGCN-NEXT: ret i32 [[TMP5]] +; +; NVPTX-LABEL: @num_blocks_z( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.num.blocks.z() + ret i32 %1 +} + +declare i32 @llvm.gpu.num.blocks.z() + +define i32 @block_id_x() { +; AMDGCN-LABEL: @block_id_x( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @block_id_x( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.block.id.x() + ret i32 %1 +} + +declare i32 @llvm.gpu.block.id.x() + +define i32 @block_id_y() { +; AMDGCN-LABEL: @block_id_y( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @block_id_y( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.block.id.y() + ret i32 %1 +} + +declare i32 @llvm.gpu.block.id.y() + +define i32 @block_id_z() { +; AMDGCN-LABEL: @block_id_z( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @block_id_z( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 
@llvm.gpu.block.id.z() + ret i32 %1 +} + +declare i32 @llvm.gpu.block.id.z() + +define i32 @num_threads_x() { +; AMDGCN-LABEL: @num_threads_x( +; AMDGCN-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4 +; AMDGCN-NEXT: [[TMP2:%.*]] = icmp sge i32 [[TMP9]], 500 +; AMDGCN-NEXT: [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], i32 12 +; AMDGCN-NEXT: [[TMP5:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP5]], i32 4 +; AMDGCN-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], ptr addrspace(4) [[TMP4]], ptr addrspace(4) [[TMP6]] +; AMDGCN-NEXT: [[TMP8:%.*]] = load i16, ptr addrspace(4) [[TMP7]], align 2, !invariant.load [[META1]], !noundef [[META1]] +; AMDGCN-NEXT: [[TMP1:%.*]] = zext i16 [[TMP8]] to i32 +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @num_threads_x( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.num.threads.x() + ret i32 %1 +} + +declare i32 @llvm.gpu.num.threads.x() + +define i32 @num_threads_y() { +; AMDGCN-LABEL: @num_threads_y( +; AMDGCN-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4 +; AMDGCN-NEXT: [[TMP2:%.*]] = icmp sge i32 [[TMP9]], 500 +; AMDGCN-NEXT: [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], i32 14 +; AMDGCN-NEXT: [[TMP5:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP5]], i32 6 +; AMDGCN-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], ptr addrspace(4) [[TMP4]], ptr addrspace(4) [[TMP6]] +; AMDGCN-NEXT: [[TMP8:%.*]] = load i16, ptr addrspace(4) [[TMP7]], align 2, !invariant.load [[META1]], !noundef [[META1]] +; AMDGCN-NEXT: [[TMP1:%.*]] = zext i16 [[TMP8]] to i32 +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @num_threads_y( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.num.threads.y() + ret i32 %1 +} + +declare i32 @llvm.gpu.num.threads.y() + +define i32 @num_threads_z() { +; AMDGCN-LABEL: @num_threads_z( +; AMDGCN-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4 +; AMDGCN-NEXT: [[TMP2:%.*]] = icmp sge i32 [[TMP9]], 500 +; AMDGCN-NEXT: [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], i32 16 +; AMDGCN-NEXT: [[TMP5:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP5]], i32 8 +; AMDGCN-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], ptr addrspace(4) [[TMP4]], ptr addrspace(4) [[TMP6]] +; AMDGCN-NEXT: [[TMP8:%.*]] = load i16, ptr addrspace(4) [[TMP7]], align 2, !invariant.load [[META1]], !noundef [[META1]] +; AMDGCN-NEXT: [[TMP1:%.*]] = zext i16 [[TMP8]] to i32 +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @num_threads_z( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.num.threads.z() + ret i32 %1 +} + +declare i32 @llvm.gpu.num.threads.z() + +define 
i32 @thread_id_x() { +; AMDGCN-LABEL: @thread_id_x( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @thread_id_x( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.thread.id.x() + ret i32 %1 +} + +declare i32 @llvm.gpu.thread.id.x() + +define i32 @thread_id_y() { +; AMDGCN-LABEL: @thread_id_y( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @thread_id_y( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.thread.id.y() + ret i32 %1 +} + +declare i32 @llvm.gpu.thread.id.y() + +define i32 @thread_id_z() { +; AMDGCN-LABEL: @thread_id_z( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @thread_id_z( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.thread.id.z() + ret i32 %1 +} + +declare i32 @llvm.gpu.thread.id.z() + +define i32 @num_lanes() { +; AMDGCN-LABEL: @num_lanes( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.wavefrontsize() +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @num_lanes( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.num.lanes() + ret i32 %1 +} + +declare i32 @llvm.gpu.num.lanes() + +define i32 @lane_id() { +; AMDGCN-LABEL: @lane_id( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; AMDGCN-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP1]]) +; AMDGCN-NEXT: ret i32 [[TMP2]] +; +; NVPTX-LABEL: @lane_id( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.laneid() +; NVPTX-NEXT: ret i32 [[TMP1]] +; + %1 = call i32 @llvm.gpu.lane.id() + ret i32 %1 +} + +declare i32 @llvm.gpu.lane.id() + +define i64 @lane_mask() { +; AMDGCN-LABEL: @lane_mask( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; AMDGCN-NEXT: ret i64 [[TMP1]] +; +; NVPTX-LABEL: @lane_mask( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.activemask() +; NVPTX-NEXT: [[CONV:%.*]] = zext i32 [[TMP1]] to i64 +; NVPTX-NEXT: ret i64 [[CONV]] +; + %1 = call i64 @llvm.gpu.lane.mask() + ret i64 %1 +} + +declare i64 @llvm.gpu.lane.mask() + +define i32 @read_first_lane_u32(i64 %lane_mask, i32 %x) { +; AMDGCN-LABEL: @read_first_lane_u32( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[X:%.*]]) +; AMDGCN-NEXT: ret i32 [[TMP1]] +; +; NVPTX-LABEL: @read_first_lane_u32( +; NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[LANE_MASK:%.*]] to i32 +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.cttz.i32(i32 [[CONV]], i1 true) +; NVPTX-NEXT: [[ISZERO:%.*]] = icmp eq i32 [[CONV]], 0 +; NVPTX-NEXT: [[SUB:%.*]] = select i1 [[ISZERO]], i32 -1, i32 [[TMP1]] +; NVPTX-NEXT: [[TMP2:%.*]] = call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 [[CONV]], i32 [[X:%.*]], i32 [[SUB]], i32 31) +; NVPTX-NEXT: ret i32 [[TMP2]] +; + %1 = call i32 @llvm.gpu.read.first.lane.u32(i64 %lane_mask, i32 %x) + ret i32 %1 +} + +declare i32 @llvm.gpu.read.first.lane.u32(i64, i32) + +define i64 @ballot(i64 %lane_mask, i1 zeroext %x) { +; AMDGCN-LABEL: @ballot( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[X:%.*]]) +; AMDGCN-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], [[LANE_MASK:%.*]] +; 
AMDGCN-NEXT: ret i64 [[TMP2]] +; +; NVPTX-LABEL: @ballot( +; NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[LANE_MASK:%.*]] to i32 +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.vote.ballot.sync(i32 [[CONV]], i1 [[X:%.*]]) +; NVPTX-NEXT: [[CONV1:%.*]] = zext i32 [[TMP1]] to i64 +; NVPTX-NEXT: ret i64 [[CONV1]] +; + %1 = call i64 @llvm.gpu.ballot(i64 %lane_mask, i1 %x) + ret i64 %1 +} + +declare i64 @llvm.gpu.ballot(i64, i1) + +define void @sync_threads() { +; AMDGCN-LABEL: @sync_threads( +; AMDGCN-NEXT: call void @llvm.amdgcn.s.barrier() +; AMDGCN-NEXT: fence syncscope("workgroup") seq_cst +; AMDGCN-NEXT: ret void +; +; NVPTX-LABEL: @sync_threads( +; NVPTX-NEXT: call void @llvm.nvvm.barrier0() +; NVPTX-NEXT: ret void +; + call void @llvm.gpu.sync.threads() + ret void +} + +declare void @llvm.gpu.sync.threads() + +define void @sync_lane(i64 %lane_mask) { +; AMDGCN-LABEL: @sync_lane( +; AMDGCN-NEXT: call void @llvm.amdgcn.wave.barrier() +; AMDGCN-NEXT: ret void +; +; NVPTX-LABEL: @sync_lane( +; NVPTX-NEXT: [[TMP1:%.*]] = trunc i64 [[LANE_MASK:%.*]] to i32 +; NVPTX-NEXT: call void @llvm.nvvm.bar.warp.sync(i32 [[TMP1]]) +; NVPTX-NEXT: ret void +; + call void @llvm.gpu.sync.lane(i64 %lane_mask) + ret void +} + +declare void @llvm.gpu.sync.lane(i64) + +define i32 @shuffle_idx_u32(i64 %lane_mask, i32 %idx, i32 %x, i32 %width) { +; AMDGCN-LABEL: @shuffle_idx_u32( +; AMDGCN-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; AMDGCN-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP1]]) +; AMDGCN-NEXT: [[NOT:%.*]] = sub i32 0, [[WIDTH:%.*]] +; AMDGCN-NEXT: [[AND:%.*]] = and i32 [[TMP2]], [[NOT]] +; AMDGCN-NEXT: [[ADD:%.*]] = add i32 [[AND]], [[IDX:%.*]] +; AMDGCN-NEXT: [[SHL:%.*]] = shl i32 [[ADD]], 2 +; AMDGCN-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[SHL]], i32 [[X:%.*]]) +; AMDGCN-NEXT: ret i32 [[TMP3]] +; +; NVPTX-LABEL: @shuffle_idx_u32( +; NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[LANE_MASK:%.*]] to i32 +; NVPTX-NEXT: [[SH_PROM:%.*]] = zext i32 [[IDX:%.*]] to i64 +; NVPTX-NEXT: [[TMP1:%.*]] = shl i32 [[WIDTH:%.*]], 8 +; NVPTX-NEXT: [[OR:%.*]] = sub i32 8223, [[TMP1]] +; NVPTX-NEXT: [[TMP2:%.*]] = call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 [[CONV]], i32 [[X:%.*]], i32 [[IDX]], i32 [[OR]]) +; NVPTX-NEXT: [[TMP3:%.*]] = shl i64 1, [[SH_PROM]] +; NVPTX-NEXT: [[TMP4:%.*]] = and i64 [[TMP3]], [[LANE_MASK]] +; NVPTX-NEXT: [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 0 +; NVPTX-NEXT: [[AND4:%.*]] = select i1 [[TMP5]], i32 0, i32 [[TMP2]] +; NVPTX-NEXT: ret i32 [[AND4]] +; + %1 = call i32 @llvm.gpu.shuffle.idx.u32(i64 %lane_mask, i32 %idx, i32 %x, i32 %width) + ret i32 %1 +} + +declare i32 @llvm.gpu.shuffle.idx.u32(i64, i32, i32, i32) + +define void @gpu_exit() { +; AMDGCN-LABEL: @gpu_exit( +; AMDGCN-NEXT: call void @llvm.amdgcn.endpgm() +; AMDGCN-NEXT: ret void +; +; NVPTX-LABEL: @gpu_exit( +; NVPTX-NEXT: call void @llvm.nvvm.exit() +; NVPTX-NEXT: ret void +; + call void @llvm.gpu.exit() + ret void +} + +declare void @llvm.gpu.exit() + +define void @thread_suspend() { +; AMDGCN-LABEL: @thread_suspend( +; AMDGCN-NEXT: call void @llvm.amdgcn.s.sleep(i32 2) +; AMDGCN-NEXT: ret void +; +; NVPTX-LABEL: @thread_suspend( +; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.reflect(ptr @[[GLOB0:[0-9]+]]) +; NVPTX-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[TMP1]], 699 +; NVPTX-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]] +; NVPTX: 3: +; NVPTX-NEXT: call void @llvm.nvvm.nanosleep(i32 64) +; NVPTX-NEXT: br label [[TMP4]] +; NVPTX: 4: +; NVPTX-NEXT: ret void +; + call 
void @llvm.gpu.thread.suspend()
  ret void
}

declare void @llvm.gpu.thread.suspend()
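
A minimal usage sketch, not part of the patch: portable code can call the new
__builtin_gpu_* builtins directly, and the resulting llvm.gpu.* intrinsics are
lowered by -passes=lower-gpu-intrinsic once a concrete target (amdgcn or nvptx)
is known, e.g. when specialising spirv64 output.

  #include <stdint.h>

  // Flat x-dimension thread index composed from the grid builtins.
  static inline uint32_t flat_thread_id_x(void) {
    return __builtin_gpu_num_threads_x() * __builtin_gpu_block_id_x() +
           __builtin_gpu_thread_id_x();
  }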