================
@@ -0,0 +1,501 @@
+//===- LowerGPUIntrinsic.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lower the llvm.gpu intrinsics to target-specific code sequences.
+// Can be called from clang if building for a specific GPU, or from the backend
+// as part of a SPIRV lowering pipeline. The initial pass can lower to amdgcn
+// or nvptx; adding further architectures means adding a column to the lookup
+// table, and adding further intrinsics means adding a row.
+//
+// The idea is for the intrinsics to represent a thin abstraction over the
+// different GPU architectures. In particular, code compiled to spirv-- without
+// specifying a specific target can be specialised at JIT time, at which point
+// this pass will rewrite those intrinsics to ones that the current backend
+// knows.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerGPUIntrinsic.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "lower-gpu-intrinsic"
+
+using namespace llvm;
+
+namespace {
+
+// For each intrinsic, specify what function to call to lower it.
+typedef bool (*lowerFunction)(Module &M, IRBuilder<> &, Intrinsic::ID from,
+                              CallBase *CI);
+
+// Simple lowering: directly replace the intrinsic with a different one of the
+// same type, and optionally refine range metadata on the return value.
+template <Intrinsic::ID To>
+bool S(Module &M, IRBuilder<> &, Intrinsic::ID from, CallBase *CI) {
+
+  static_assert(To != Intrinsic::not_intrinsic);
+  Intrinsic::ID GenericID = from;
+  Intrinsic::ID SpecificID = To;
+
+  bool Changed = false;
+  Function *Generic = Intrinsic::getDeclarationIfExists(&M, GenericID);
+  auto *Specific = Intrinsic::getOrInsertDeclaration(&M, SpecificID);
+
+  if ((Generic->getType() != Specific->getType()) ||
+      (Generic->getReturnType() != Specific->getReturnType()))
+    report_fatal_error("LowerGPUIntrinsic: Inconsistent types between "
+                       "intrinsics in lookup table");
+
+  CI->setCalledFunction(Specific);
+  Changed = true;
+
+  return Changed;
+}
+
+// Replace an intrinsic call with a linear sequence of instructions.
+typedef Value *(*builder)(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                          CallBase *CI);
+
+template <builder F>
+bool B(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, CallBase *CI) {
+  bool Changed = false;
+
+  Builder.SetInsertPoint(CI);
+
+  Value *replacement = F(M, Builder, from, CI);
+  if (replacement) {
+    CI->replaceAllUsesWith(replacement);
+    CI->eraseFromParent();
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+template <Intrinsic::ID Numerator, Intrinsic::ID Denominator>
+Value *intrinsicRatio(Module &M, IRBuilder<> &Builder, Intrinsic::ID,
+                      CallBase *) {
+  Value *N = Builder.CreateIntrinsic(Numerator, {}, {});
+  Value *D = Builder.CreateIntrinsic(Denominator, {}, {});
+  return Builder.CreateUDiv(N, D);
+}
+
+namespace amdgpu {
+Value *lane_mask(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                 CallBase *CI) {
+  auto &Ctx = M.getContext();
+  return Builder.CreateIntrinsic(
+      Intrinsic::amdgcn_ballot, {Type::getInt64Ty(Ctx)},
+      {ConstantInt::get(Type::getInt1Ty(Ctx), true)});
+}
+
+Value *lane_id(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+               CallBase *CI) {
+  auto &Ctx = M.getContext();
+  Constant *M1 = ConstantInt::get(Type::getInt32Ty(Ctx), -1);
+  Constant *Z = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+
+  // mbcnt counts the bits of the mask set for lanes below this one; with an
+  // all-ones mask that is the lane id within the wave.
+  CallInst *Lo =
+      Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, {M1, Z});
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {M1, Lo});
+}
+
+Value *first_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                  CallBase *CI) {
+  auto &Ctx = M.getContext();
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane,
+                                 {Type::getInt32Ty(Ctx)},
+                                 {CI->getArgOperand(1)});
+}
+
+Value *shuffle_idx(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                   CallBase *CI) {
+  auto &Ctx = M.getContext();
+
+  Value *idx = CI->getArgOperand(1);
+  Value *x = CI->getArgOperand(2);
+  Value *width = CI->getArgOperand(3);
+
+  Value *id = Builder.CreateIntrinsic(Intrinsic::gpu_lane_id, {}, {});
+
+  // Round the lane id down to a multiple of width (a power of two), add the
+  // requested index, then scale by 4 because ds_bpermute addresses lanes in
+  // bytes.
+  Value *n = Builder.CreateSub(ConstantInt::get(Type::getInt32Ty(Ctx), 0),
+                               width, "not");
+  Value *a = Builder.CreateAnd(id, n, "and");
+  Value *add = Builder.CreateAdd(a, idx, "add");
+  Value *shl =
+      Builder.CreateShl(add, ConstantInt::get(Type::getInt32Ty(Ctx), 2), "shl");
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_ds_bpermute, {}, {shl, x});
+}
+
+Value *ballot(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+              CallBase *CI) {
+  auto &Ctx = M.getContext();
+
+  Value *C =
+      Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot, {Type::getInt64Ty(Ctx)},
+                              {CI->getArgOperand(1)});
+
+  return Builder.CreateAnd(C, CI->getArgOperand(0));
+}
+
+Value *sync_threads(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                    CallBase *CI) {
+  auto &Ctx = M.getContext();
+  Builder.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {});
+
+  Value *F = Builder.CreateFence(AtomicOrdering::SequentiallyConsistent,
+                                 Ctx.getOrInsertSyncScopeID("workgroup"));
+
+  return F;
+}
+
+Value *sync_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                 CallBase *CI) {
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_wave_barrier, {}, {});
+}
+
+Value *thread_suspend(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                      CallBase *CI) {
+
+  auto &Ctx = M.getContext();
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_s_sleep, {},
+                                 {ConstantInt::get(Type::getInt32Ty(Ctx), 2)});
+}
+
+Value *dispatch_ptr(IRBuilder<> &Builder) {
+  CallInst *Call =
+      Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {}, {});
+  Call->addRetAttr(
+      Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
+  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4)));
+  return Call;
+}
+
+Value *implicit_arg_ptr(IRBuilder<> &Builder) {
+  CallInst *Call =
+      Builder.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {}, {});
+  Call->addRetAttr(
+      Attribute::getWithDereferenceableBytes(Call->getContext(), 256));
+  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8)));
+  return Call;
+}
+
+template <unsigned Index>
+Value *grid_size(Module &M, IRBuilder<> &Builder, Intrinsic::ID, CallBase *) {
+  auto &Ctx = M.getContext();
+  const unsigned XOffset = 12;
+  auto *DP = dispatch_ptr(Builder);
+
+  // Indexing the HSA kernel_dispatch_packet struct.
+  auto *Offset = ConstantInt::get(Type::getInt32Ty(Ctx), XOffset + Index * 4);
+  auto *GEP = Builder.CreateGEP(Type::getInt8Ty(Ctx), DP, Offset);
+  auto *LD = Builder.CreateLoad(Type::getInt32Ty(Ctx), GEP);
+  llvm::MDBuilder MDB(Ctx);
+  // Known non-zero.
+  LD->setMetadata(llvm::LLVMContext::MD_range,
+                  MDB.createRange(APInt(32, 1), APInt::getZero(32)));
+  LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
+                  llvm::MDNode::get(Ctx, {}));
+  return LD;
+}
+
+template <int Index>
+Value *WGSize(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+              CallBase *CI) {
+
+  // Note: "__oclc_ABI_version" is supposed to be emitted and initialized by
+  // clang during compilation of user code.
+  StringRef Name = "__oclc_ABI_version";
+  auto *ABIVersionC = M.getNamedGlobal(Name);
+  if (!ABIVersionC) {
+    // In CGBuiltin we'd have to create an extern variable to emit the load for
+    // it. Here we can leave the intrinsic in place and it'll get lowered later.
+    return nullptr;
+  }
----------------
JonChesterfield wrote:
It isn't. That's what the return branch means: if there's no magic variable saying where the value is to be found, leave the intrinsic in place until that variable shows up later. If it never shows up, well, we haven't chosen a code object format and can't codegen.

https://github.com/llvm/llvm-project/pull/131190
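
[Editor's sketch, not part of the patch or the thread.] A minimal illustration of how the lookup-table design from the file header and the "leave it in place" behaviour discussed above could fit together. ExampleMapEntry, ExampleMap, lowerOneCall and IsAMDGCN are made-up names; the sketch assumes the lowerFunction typedef, the S<>/B<> wrappers and amdgpu::lane_id from the quoted patch are in scope, and the real table and driver presumably appear further down in the 501-line file, which is not quoted here.

// Illustrative only; relies on the anonymous-namespace helpers in the patch.
struct ExampleMapEntry {
  Intrinsic::ID Generic; // row: one llvm.gpu.* intrinsic
  lowerFunction AMDGPU;  // column: lowering used for amdgcn
  lowerFunction NVPTX;   // column: lowering used for nvptx
};

// Hypothetical row: build an instruction sequence on amdgcn, swap the
// intrinsic directly on nvptx.
static const ExampleMapEntry ExampleMap[] = {
    {Intrinsic::gpu_lane_id, B<amdgpu::lane_id>,
     S<Intrinsic::nvvm_read_ptx_sreg_laneid>},
};

// Hypothetical driver for one call site. A lowering function may return false
// (for example B<WGSize> when __oclc_ABI_version is not in the module yet);
// the call is then left untouched so a later run of the pass can lower it
// once the variable exists.
static bool lowerOneCall(Module &M, IRBuilder<> &Builder,
                         const ExampleMapEntry &E, CallBase *CI,
                         bool IsAMDGCN) {
  lowerFunction F = IsAMDGCN ? E.AMDGPU : E.NVPTX;
  return F(M, Builder, E.Generic, CI);
}

The point of the sketch is just that a lowering helper declining to act is not an error: the generic intrinsic survives until a concrete target, and in the WGSize case the ABI-version variable, is available.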