================
@@ -0,0 +1,501 @@
+//===- LowerGPUIntrinsic.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lower the llvm.gpu intrinsics to target-specific code sequences.
+// Can be called from clang if building for a specific GPU, or from the backend
+// as part of a SPIRV lowering pipeline. The initial pass can lower to amdgcn
+// or nvptx; adding further architectures means adding a column to the lookup
+// table, and adding further intrinsics means adding a row.
+//
+// The idea is for the intrinsics to represent a thin abstraction over the
+// different GPU architectures. In particular, code compiled to spirv-- without
+// specifying a specific target can be specialised at JIT time, at which point
+// this pass will rewrite those intrinsics to ones that the current backend
+// knows.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerGPUIntrinsic.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "lower-gpu-intrinsic"
+
+using namespace llvm;
+
+namespace {
+
+// For each intrinsic, specify what function to call to lower it.
+typedef bool (*lowerFunction)(Module &M, IRBuilder<> &, Intrinsic::ID from,
+                              CallBase *CI);
+
+// Simple lowering: directly replace the intrinsic with a different one of the
+// same type, and optionally refine range metadata on the return value.
+template <Intrinsic::ID To>
+bool S(Module &M, IRBuilder<> &, Intrinsic::ID from, CallBase *CI) {
+
+  static_assert(To != Intrinsic::not_intrinsic);
+  Intrinsic::ID GenericID = from;
+  Intrinsic::ID SpecificID = To;
+
+  bool Changed = false;
+  Function *Generic = Intrinsic::getDeclarationIfExists(&M, GenericID);
+  auto *Specific = Intrinsic::getOrInsertDeclaration(&M, SpecificID);
+
+  if ((Generic->getType() != Specific->getType()) ||
+      (Generic->getReturnType() != Specific->getReturnType()))
+    report_fatal_error("LowerGPUIntrinsic: Inconsistent types between "
+                       "intrinsics in lookup table");
+
+  CI->setCalledFunction(Specific);
+  Changed = true;
+
+  return Changed;
+}
+
+// Replace an intrinsic call with a linear sequence of instructions.
+typedef Value *(*builder)(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                          CallBase *CI);
+
+template <builder F>
+bool B(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, CallBase *CI) {
+  bool Changed = false;
+
+  Builder.SetInsertPoint(CI);
+
+  Value *replacement = F(M, Builder, from, CI);
+  if (replacement) {
+    CI->replaceAllUsesWith(replacement);
+    CI->eraseFromParent();
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+template <Intrinsic::ID Numerator, Intrinsic::ID Denominator>
+Value *intrinsicRatio(Module &M, IRBuilder<> &Builder, Intrinsic::ID,
+                      CallBase *) {
+  Value *N = Builder.CreateIntrinsic(Numerator, {}, {});
+  Value *D = Builder.CreateIntrinsic(Denominator, {}, {});
+  return Builder.CreateUDiv(N, D);
+}
+
+namespace amdgpu {
+Value *lane_mask(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                 CallBase *CI) {
+  auto &Ctx = M.getContext();
+  return Builder.CreateIntrinsic(
+      Intrinsic::amdgcn_ballot, {Type::getInt64Ty(Ctx)},
+      {ConstantInt::get(Type::getInt1Ty(Ctx), true)});
+}
+
+Value *lane_id(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+               CallBase *CI) {
+  auto &Ctx = M.getContext();
+  Constant *M1 = ConstantInt::get(Type::getInt32Ty(Ctx), -1);
+  Constant *Z = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+
+  // mbcnt counts the bits of the mask set for lanes below this one; with an
+  // all-ones mask that is the lane id within the wave.
+  CallInst *Lo =
+      Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, {M1, Z});
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {M1, Lo});
+}
+
+Value *first_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                  CallBase *CI) {
+  auto &Ctx = M.getContext();
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane,
+                                 {Type::getInt32Ty(Ctx)},
+                                 {CI->getArgOperand(1)});
+}
+
+Value *shuffle_idx(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                   CallBase *CI) {
+  auto &Ctx = M.getContext();
+
+  Value *idx = CI->getArgOperand(1);
+  Value *x = CI->getArgOperand(2);
+  Value *width = CI->getArgOperand(3);
+
+  Value *id = Builder.CreateIntrinsic(Intrinsic::gpu_lane_id, {}, {});
+
+  // Round the lane id down to a multiple of width (a power of two), add the
+  // requested index, then scale by 4 because ds_bpermute addresses lanes in
+  // bytes.
+  Value *n = Builder.CreateSub(ConstantInt::get(Type::getInt32Ty(Ctx), 0),
+                               width, "not");
+  Value *a = Builder.CreateAnd(id, n, "and");
+  Value *add = Builder.CreateAdd(a, idx, "add");
+  Value *shl =
+      Builder.CreateShl(add, ConstantInt::get(Type::getInt32Ty(Ctx), 2), "shl");
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_ds_bpermute, {}, {shl, x});
+}
+
+Value *ballot(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+              CallBase *CI) {
+  auto &Ctx = M.getContext();
+
+  Value *C =
+      Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot, {Type::getInt64Ty(Ctx)},
+                              {CI->getArgOperand(1)});
+
+  return Builder.CreateAnd(C, CI->getArgOperand(0));
+}
+
+Value *sync_threads(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                    CallBase *CI) {
+  auto &Ctx = M.getContext();
+  Builder.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {});
+
+  Value *F = Builder.CreateFence(AtomicOrdering::SequentiallyConsistent,
+                                 Ctx.getOrInsertSyncScopeID("workgroup"));
+
+  return F;
+}
+
+Value *sync_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                 CallBase *CI) {
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_wave_barrier, {}, {});
+}
+
+Value *thread_suspend(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                      CallBase *CI) {
+
+  auto &Ctx = M.getContext();
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_s_sleep, {},
+                                 {ConstantInt::get(Type::getInt32Ty(Ctx), 2)});
+}
+
+Value *dispatch_ptr(IRBuilder<> &Builder) {
+  CallInst *Call =
+      Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {}, {});
+  Call->addRetAttr(
+      Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
+  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4)));
+  return Call;
+}
+
+Value *implicit_arg_ptr(IRBuilder<> &Builder) {
+  CallInst *Call =
+      Builder.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {}, {});
+  Call->addRetAttr(
+      Attribute::getWithDereferenceableBytes(Call->getContext(), 256));
+  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8)));
+  return Call;
+}
+
+template <unsigned Index>
+Value *grid_size(Module &M, IRBuilder<> &Builder, Intrinsic::ID, CallBase *) {
+  auto &Ctx = M.getContext();
+  const unsigned XOffset = 12;
+  auto *DP = dispatch_ptr(Builder);
+
+  // Indexing the HSA kernel_dispatch_packet struct.
+  auto *Offset = ConstantInt::get(Type::getInt32Ty(Ctx), XOffset + Index * 4);
+  auto *GEP = Builder.CreateGEP(Type::getInt8Ty(Ctx), DP, Offset);
+  auto *LD = Builder.CreateLoad(Type::getInt32Ty(Ctx), GEP);
+  llvm::MDBuilder MDB(Ctx);
+  // Known non-zero.
+  LD->setMetadata(llvm::LLVMContext::MD_range,
+                  MDB.createRange(APInt(32, 1), APInt::getZero(32)));
+  LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
+                  llvm::MDNode::get(Ctx, {}));
+  return LD;
+}
+
+template <int Index>
+Value *WGSize(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+              CallBase *CI) {
+
+  // Note: "__oclc_ABI_version" is supposed to be emitted and initialized by
+  // clang during compilation of user code.
+  StringRef Name = "__oclc_ABI_version";
+  auto *ABIVersionC = M.getNamedGlobal(Name);
+  if (!ABIVersionC) {
+    // In CGBuiltin we'd have to create an extern variable to emit the load for
+    // it. Here we can leave the intrinsic in place and it'll get lowered later.
+    return nullptr;
+  }
----------------
JonChesterfield wrote:
It isn't. That's what the return branch means: if there's no magic variable saying where the value is to be found, leave the intrinsic in place until that variable shows up later. If it never shows up, well, we haven't chosen a code object format and can't codegen.

https://github.com/llvm/llvm-project/pull/131190
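
[Editor's sketch, not part of the patch or the thread.] A minimal illustration of how the lookup-table design from the file header and the "leave it in place" behaviour discussed above could fit together. ExampleMapEntry, ExampleMap, lowerOneCall and IsAMDGCN are made-up names; the sketch assumes the lowerFunction typedef, the S<>/B<> wrappers and amdgpu::lane_id from the quoted patch are in scope, and the real table and driver presumably appear further down in the 501-line file, which is not quoted here.

// Illustrative only; relies on the anonymous-namespace helpers in the patch.
struct ExampleMapEntry {
  Intrinsic::ID Generic; // row: one llvm.gpu.* intrinsic
  lowerFunction AMDGPU;  // column: lowering used for amdgcn
  lowerFunction NVPTX;   // column: lowering used for nvptx
};

// Hypothetical row: build an instruction sequence on amdgcn, swap the
// intrinsic directly on nvptx.
static const ExampleMapEntry ExampleMap[] = {
    {Intrinsic::gpu_lane_id, B<amdgpu::lane_id>,
     S<Intrinsic::nvvm_read_ptx_sreg_laneid>},
};

// Hypothetical driver for one call site. A lowering function may return false
// (for example B<WGSize> when __oclc_ABI_version is not in the module yet);
// the call is then left untouched so a later run of the pass can lower it
// once the variable exists.
static bool lowerOneCall(Module &M, IRBuilder<> &Builder,
                         const ExampleMapEntry &E, CallBase *CI,
                         bool IsAMDGCN) {
  lowerFunction F = IsAMDGCN ? E.AMDGPU : E.NVPTX;
  return F(M, Builder, E.Generic, CI);
}

The point of the sketch is just that a lowering helper declining to act is not an error: the generic intrinsic survives until a concrete target, and in the WGSize case the ABI-version variable, is available.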