================
@@ -0,0 +1,501 @@
+//===- LowerGPUIntrinsic.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lower the llvm.gpu intrinsics to target-specific code sequences.
+// Can be called from clang if building for a specific GPU or from the backend
+// as part of a SPIRV lowering pipeline. The initial pass can lower to amdgcn
+// or nvptx; adding further architectures means adding a column to the lookup
+// table, and adding further intrinsics means adding a row.
+//
+// The idea is for the intrinsics to represent a thin abstraction over the
+// different GPU architectures. In particular, code compiled to spirv-- without
+// specifying a specific target can be specialised at JIT time, at which point
+// this pass will rewrite those intrinsics to ones that the current backend
+// knows.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerGPUIntrinsic.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "lower-gpu-intrinsic"
+
+using namespace llvm;
+
+namespace {
+
+// For each intrinsic, specify what function to call to lower it.
+typedef bool (*lowerFunction)(Module &M, IRBuilder<> &, Intrinsic::ID from,
+                              CallBase *CI);
+
+// Simple lowering: directly replace the intrinsic with a different one with
+// the same type, and optionally refine range metadata on the return value.
+template <Intrinsic::ID To>
+bool S(Module &M, IRBuilder<> &, Intrinsic::ID from, CallBase *CI) {
+
+  static_assert(To != Intrinsic::not_intrinsic);
+  Intrinsic::ID GenericID = from;
+  Intrinsic::ID SpecificID = To;
+
+  bool Changed = false;
+  Function *Generic = Intrinsic::getDeclarationIfExists(&M, GenericID);
+  auto *Specific = Intrinsic::getOrInsertDeclaration(&M, SpecificID);
+
+  if ((Generic->getType() != Specific->getType()) ||
+      (Generic->getReturnType() != Specific->getReturnType()))
+    report_fatal_error("LowerGPUIntrinsic: Inconsistent types between "
+                       "intrinsics in lookup table");
+
+  CI->setCalledFunction(Specific);
+  Changed = true;
+
+  return Changed;
+}
+
+// Replace an intrinsic call with a linear sequence of instructions.
+typedef Value *(*builder)(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                          CallBase *CI);
+
+template <builder F>
+bool B(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, CallBase *CI) {
+  bool Changed = false;
+
+  Builder.SetInsertPoint(CI);
+
+  Value *replacement = F(M, Builder, from, CI);
+  if (replacement) {
+    CI->replaceAllUsesWith(replacement);
+    CI->eraseFromParent();
+    Changed = true;
+  }
+
+  return Changed;
+}
+
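+// For illustration, a hypothetical lookup table row in the style of the
+// table defined later in this file (the generic intrinsic name is a
+// stand-in; the per-target intrinsics named in the columns are real):
+//
+//   {Intrinsic::gpu_thread_id_x,               // row: one generic intrinsic
+//    S<Intrinsic::amdgcn_workitem_id_x>,       // amdgcn column
+//    S<Intrinsic::nvvm_read_ptx_sreg_tid_x>},  // nvptx column
+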
+template <Intrinsic::ID Numerator, Intrinsic::ID Denominator>
+Value *intrinsicRatio(Module &M, IRBuilder<> &Builder, Intrinsic::ID,
+                      CallBase *) {
+  Value *N = Builder.CreateIntrinsic(Numerator, {}, {});
+  Value *D = Builder.CreateIntrinsic(Denominator, {}, {});
+  return Builder.CreateUDiv(N, D);
+}
+
+namespace amdgpu {
+Value *lane_mask(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                 CallBase *CI) {
+  auto &Ctx = M.getContext();
+  return Builder.CreateIntrinsic(
+      Intrinsic::amdgcn_ballot, {Type::getInt64Ty(Ctx)},
+      {ConstantInt::get(Type::getInt1Ty(Ctx), true)});
+}
+
+Value *lane_id(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+               CallBase *CI) {
+  auto &Ctx = M.getContext();
+  Constant *M1 = ConstantInt::get(Type::getInt32Ty(Ctx), -1);
+  Constant *Z = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+
+  CallInst *Lo =
+      Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, {M1, Z});
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {M1, Lo});
+}
+
+Value *first_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                  CallBase *CI) {
+  auto &Ctx = M.getContext();
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane,
+                                 {Type::getInt32Ty(Ctx)},
+                                 {CI->getArgOperand(1)});
+}
+
+Value *shuffle_idx(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                   CallBase *CI) {
+  auto &Ctx = M.getContext();
+
+  Value *idx = CI->getArgOperand(1);
+  Value *x = CI->getArgOperand(2);
+  Value *width = CI->getArgOperand(3);
+
+  Value *id = Builder.CreateIntrinsic(Intrinsic::gpu_lane_id, {}, {});
+
+  Value *n = Builder.CreateSub(ConstantInt::get(Type::getInt32Ty(Ctx), 0),
+                               width, "not");
+  Value *a = Builder.CreateAnd(id, n, "and");
+  Value *add = Builder.CreateAdd(a, idx, "add");
+  Value *shl =
+      Builder.CreateShl(add, ConstantInt::get(Type::getInt32Ty(Ctx), 2), "shl");
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_ds_bpermute, {}, {shl, x});
+}
+
+Value *ballot(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+              CallBase *CI) {
+  auto &Ctx = M.getContext();
+
+  Value *C =
+      Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot,
+                              {Type::getInt64Ty(Ctx)}, {CI->getArgOperand(1)});
+
+  return Builder.CreateAnd(C, CI->getArgOperand(0));
+}
+
+Value *sync_threads(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                    CallBase *CI) {
+  auto &Ctx = M.getContext();
+  Builder.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {});
+
+  Value *F = Builder.CreateFence(AtomicOrdering::SequentiallyConsistent,
+                                 Ctx.getOrInsertSyncScopeID("workgroup"));
+
+  return F;
+}
+
+Value *sync_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                 CallBase *CI) {
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_wave_barrier, {}, {});
+}
+
+Value *thread_suspend(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                      CallBase *CI) {
+
+  auto &Ctx = M.getContext();
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_s_sleep, {},
+                                 {ConstantInt::get(Type::getInt32Ty(Ctx), 2)});
+}
+
+Value *dispatch_ptr(IRBuilder<> &Builder) {
+  CallInst *Call =
+      Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {}, {});
+  Call->addRetAttr(
+      Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
+  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4)));
+  return Call;
+}
+
+Value *implicit_arg_ptr(IRBuilder<> &Builder) {
+  CallInst *Call =
+      Builder.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {}, {});
+  Call->addRetAttr(
+      Attribute::getWithDereferenceableBytes(Call->getContext(), 256));
+  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8)));
+  return Call;
+}
+
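+// Rough sketch, for orientation only, of the leading fields of the HSA
+// kernel_dispatch_packet struct that grid_size below reads (field offsets
+// per the HSA spec):
+//
+//   offset  0: uint16_t header;              offset  2: uint16_t setup;
+//   offset  4: uint16_t workgroup_size_x, _y, _z;  // offsets 4, 6, 8
+//   offset 12: uint32_t grid_size_x, _y, _z;       // offsets 12, 16, 20
+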
+template <unsigned Index>
+Value *grid_size(Module &M, IRBuilder<> &Builder, Intrinsic::ID, CallBase *) {
+  auto &Ctx = M.getContext();
+  const unsigned XOffset = 12;
+  auto *DP = dispatch_ptr(Builder);
+
+  // Indexing the HSA kernel_dispatch_packet struct.
+  auto *Offset = ConstantInt::get(Type::getInt32Ty(Ctx), XOffset + Index * 4);
+  auto *GEP = Builder.CreateGEP(Type::getInt8Ty(Ctx), DP, Offset);
+  auto *LD = Builder.CreateLoad(Type::getInt32Ty(Ctx), GEP);
+  llvm::MDBuilder MDB(Ctx);
+  // Known non-zero.
+  LD->setMetadata(llvm::LLVMContext::MD_range,
+                  MDB.createRange(APInt(32, 1), APInt::getZero(32)));
+  LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
+                  llvm::MDNode::get(Ctx, {}));
+  return LD;
+}
+
+template <int Index>
+Value *WGSize(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+              CallBase *CI) {
+
+  // Note: "__oclc_ABI_version" is supposed to be emitted and initialized by
+  // clang during compilation of user code.
+  StringRef Name = "__oclc_ABI_version";
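+  // (Background: this global typically comes from clang or the ROCm device
+  // libraries and encodes the code object ABI version, letting one bitcode
+  // library serve several ABIs; under code object v5 the workgroup sizes are
+  // carried in the implicit kernel arguments, cf. implicit_arg_ptr above,
+  // rather than only in the dispatch packet.)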
----------------

jhuber6 wrote:

I hate this thing. https://github.com/llvm/llvm-project/pull/131190